In [1]:
import pandas as pd
from itertools import combinations

In [2]:
base = "./data/"

order_products_train_df = pd.read_csv(base + "order_products__train.csv")
order_products_prior_df = pd.read_csv(base + "order_products__prior.csv")
# orders_df = pd.read_csv(base + "orders.csv")
products_df = pd.read_csv(base + "products.csv")
# aisles_df = pd.read_csv(base + "aisles.csv")
# departments_df = pd.read_csv(base + "departments.csv")

In [3]:
order_products_train_df = order_products_train_df[["order_id", "product_id"]]
order_products_train_df.head()

Unnamed: 0,order_id,product_id
0,1,49302
1,1,11109
2,1,10246
3,1,49683
4,1,43633


In [4]:
order_products_prior_df = order_products_prior_df[["order_id", "product_id"]]
order_products_prior_df.head()

Unnamed: 0,order_id,product_id
0,2,33120
1,2,28985
2,2,9327
3,2,45918
4,2,30035


In [5]:
order_products_df = pd.concat([order_products_train_df, order_products_prior_df])
order_products_df = order_products_df[order_products_df["order_id"] <= 100000]
order_products_df.sort_values("order_id", inplace=True, ignore_index=True)
order_products_df

Unnamed: 0,order_id,product_id
0,1,49302
1,1,11109
2,1,10246
3,1,49683
4,1,43633
...,...,...
987254,100000,30169
987255,100000,38734
987256,100000,36759
987257,100000,31506


In [6]:
products_in_transactions = order_products_df.groupby("product_id").count().rename(columns={"order_id": "no_of_transactions"}).reset_index()
products_in_transactions.sort_values("no_of_transactions", ascending=False).reset_index(drop=True)

Unnamed: 0,product_id,no_of_transactions
0,24852,14494
1,13176,11694
2,21137,8081
3,21903,7369
4,47209,6411
...,...,...
35065,28078,1
35066,28077,1
35067,38283,1
35068,38285,1


In [7]:
order_products_df = order_products_df.merge(products_df[['product_id', 'product_name']], on='product_id')
order_products_df.sort_values("order_id", inplace=True, ignore_index=True)

In [8]:
order_products_df.head()

Unnamed: 0,order_id,product_id,product_name
0,1,49302,Bulgarian Yogurt
1,1,10246,Organic Celery Hearts
2,1,49683,Cucumber Kirby
3,1,47209,Organic Hass Avocado
4,1,22035,Organic Whole String Cheese


In [9]:
order_products_df = order_products_df.groupby("order_id")['product_name'].agg(lambda x: ', '.join(x)).reset_index()

In [10]:
order_products_df.head()

Unnamed: 0,order_id,product_name
0,1,"Bulgarian Yogurt, Organic Celery Hearts, Cucum..."
1,2,"Michigan Organic Kale, Garlic Powder, Coconut ..."
2,3,"Organic Ginger Root, Air Chilled Organic Bonel..."
3,4,"Goldfish Cheddar Baked Snack Crackers, Sugarfr..."
4,5,"American Slices Cheese, Artichokes, 2% Reduced..."


In [11]:
transactions = order_products_df['product_name'].str.split(', ')

In [12]:
transactions

0        [Bulgarian Yogurt, Organic Celery Hearts, Cucu...
1        [Michigan Organic Kale, Garlic Powder, Coconut...
2        [Organic Ginger Root, Air Chilled Organic Bone...
3        [Goldfish Cheddar Baked Snack Crackers, Sugarf...
4        [American Slices Cheese, Artichokes, 2% Reduce...
                               ...                        
97828    [Bag of Large Lemons, Broccoli Florettes, Orga...
97829    [Organic Grapes Galore Bunny Snacks, Organic B...
97830                  [Healthy Trinity, 3 in 1, Capsules]
97831    [Grainiac Organic Bread, Creamy Peanut Butter,...
97832    [Original Hummus, Gala Apples, Corn Tortillas,...
Name: product_name, Length: 97833, dtype: object

In [13]:
unique_items = set(item for transaction in transactions for item in transaction)

In [14]:
unique_items

{'Pure Squeezed Calcium & Vitamin D No Pulp Orange Juice',
 'Maple Sausage Links',
 'Reduced Fat Sliced Swiss Cheese',
 'Savory Turkey Breakfast Sausage Patties',
 'Eye Allergy Relief Eye Drops',
 'Apple Blueberry Baby Food',
 'Gluten Free Corn Tortilla Taquitos Vegan Beef Style',
 '6 Cheese Italian Shredded Cheese',
 'Soybeans in Pods Edamame',
 'Reduced Fat Original Baked Snack Crackers',
 'Egg Noodles',
 'French Lavender All-Purpose Natural Surface Cleaner',
 'Margherita Crispy Thin Crust Pizza',
 'Dark Chocolate Peanut Butter Cups',
 'Cookies And Cream Ice Cream',
 'Hydro Boost Water Gel Facial Moisturizer',
 'Orange Cream Total Omega 3-6-9 Supplement',
 'Unsweetened Almond Milk',
 'Strawberry',
 'Indulgent Coconut Milk Body Wash',
 "Devil's Food Zingers",
 'Fruit By the Foot Variety Pack',
 'Original Malt Vinegar',
 'Chopped Ripe Olives',
 'Baked Beans with Sweet Sorghum',
 "General Tso's Chicken",
 'Mexican Roasted Pasilla Chile Cooking Sauce',
 'Ultra Blue Energy Drink',
 'Cedar

In [15]:
def get_support(itemset, transactions):
    count = transactions.apply(lambda transaction: itemset.issubset(transaction)).sum()
    return count / len(transactions)

def get_confidence(itemset, antecedent, transactions):
    support_antecedent = get_support(antecedent, transactions)
    confidence = itemset["support"] / support_antecedent
    return confidence, support_antecedent

In [16]:
min_support = 0.005
min_confidence = 0.005

In [17]:
# frequent_itemsets = []
# for item in unique_items:
#     support = get_support({item}, transactions)
#     if support >= min_support:
#         frequent_itemsets.append({"itemset": item, "support": support, "k": 1})

In [18]:
# unique_items = set()
# for item_set in frequent_itemsets:
#     for item in item_set:
#         unique_items.add(item)

In [19]:
k = 1
frequent_itemsets = []
while unique_items:
    for item_pair in combinations(unique_items, k):
        support = get_support(set(item_pair), transactions)
        if support >= min_support:
            frequent_itemsets.append({"itemset": item_pair, "support": support, "k": k})
    
    unique_items = set()
    for item_set in frequent_itemsets:
        if item_set["k"] == k:
            for item in item_set["itemset"]:
                unique_items.add(item)
    
    k += 1
    print("k -->", k, (unique_items))

k --> 2 {'Blackberries', 'Garlic', 'Organic Black Beans', 'Honeycrisp Apple', 'Organic Butternut Squash', 'Organic Mint', 'Gala Apples', 'Organic Extra Firm Tofu', 'Organic Thyme', 'Green Beans', 'Organic Tomato Basil Pasta Sauce', 'Organic Tomato Paste', 'Original Veggie Straws', 'Carrots', 'Total 2% Lowfat Plain Greek Yogurt', 'Air Chilled Organic Boneless Skinless Chicken Breasts', 'Jalapeno Peppers', 'Bunch', 'Organic Green Cabbage', 'Frozen Organic Wild Blueberries', 'Organic Russet Potato', 'Organic Celery Hearts', 'Yogurt', 'Creamy Almond Butter', 'Grape White/Green Seedless', 'Organic Frozen Peas', 'Organic Baby Spinach', 'Organic 2% Reduced Fat Milk', 'Organic Jalapeno Pepper', 'Pure Irish Butter', 'Organic Kiwi', 'Organic Whole String Cheese', 'Strawberry', 'Original Hummus', 'Organic Lemon', 'Unsweetened Original Almond Breeze Almond Milk', 'Trilogy Kombucha Drink', 'Organic Garlic', 'Sparkling Water Berry', 'Broccoli Crown', 'Total 2% All Natural Low Fat 2% Milkfat Greek St

In [20]:
df = pd.DataFrame(frequent_itemsets)
df

Unnamed: 0,itemset,support,k
0,"(Strawberry,)",0.012020,1
1,"(Organic Peeled Whole Baby Carrots,)",0.017203,1
2,"(Shredded Mozzarella,)",0.006634,1
3,"(Organic Carrot Bunch,)",0.010641,1
4,"(Red Onion,)",0.012797,1
...,...,...,...
341,"(Organic Strawberries, Organic Hass Avocado)",0.012307,2
342,"(Organic Strawberries, Organic Avocado)",0.007472,2
343,"(Organic Raspberries, Organic Hass Avocado)",0.007850,2
344,"(Organic Hass Avocado, Organic Zucchini)",0.005274,2


In [21]:
df[df["k"] == 3]

Unnamed: 0,itemset,support,k
345,"(Vitamin D, Organic, Milk)",0.00646,3


In [22]:
def generate_association_rules(frequent_itemsets, min_confidence, transactions):
    rules = []
    for itemset in frequent_itemsets:
        for size in range(1, len(itemset["itemset"])):
            for subset in combinations(itemset["itemset"], size):
                antecedent = set(subset)
                consequent = set(itemset["itemset"]) - antecedent
                confidence, support_antecedent = get_confidence(itemset, antecedent, transactions)
                if confidence >= min_confidence:
                    rules.append({"antecedent": antecedent, "consequent": consequent, "support_A": support_antecedent, "support_itemset": itemset["support"], "confidence": confidence})
    return rules


In [23]:
rules = generate_association_rules(frequent_itemsets, min_confidence, transactions)

In [24]:
rules

[{'antecedent': {'Honeycrisp Apple'},
  'consequent': {'Banana'},
  'support_A': 0.02521644026044382,
  'support_itemset': 0.008984698414645364,
  'confidence': 0.3563032022699635},
 {'antecedent': {'Banana'},
  'consequent': {'Honeycrisp Apple'},
  'support_A': 0.14871260208723028,
  'support_itemset': 0.008984698414645364,
  'confidence': 0.060416523472403595},
 {'antecedent': {'Carrots'},
  'consequent': {'Banana'},
  'support_A': 0.02326413377899073,
  'support_itemset': 0.005110749951447876,
  'confidence': 0.21968365553602814},
 {'antecedent': {'Banana'},
  'consequent': {'Carrots'},
  'support_A': 0.14871260208723028,
  'support_itemset': 0.005110749951447876,
  'confidence': 0.034366623135610695},
 {'antecedent': {'Bunch'},
  'consequent': {'Organic Red Radish'},
  'support_A': 0.014279435364345365,
  'support_itemset': 0.008739382416975867,
  'confidence': 0.6120257695060844},
 {'antecedent': {'Organic Red Radish'},
  'consequent': {'Bunch'},
  'support_A': 0.00873938241697586

In [26]:
association_rules_df = pd.DataFrame(rules)

In [29]:
association_rules_df.to_csv("association_rules_1lakh_transactions.csv", index=False)