# Import Libraries

In [1]:
import pandas as pd
from itertools import combinations
from tqdm import tqdm

# Load the data

In [2]:
base = "./data/"

order_products_train_df = pd.read_csv(base + "order_products__train.csv")
order_products_prior_df = pd.read_csv(base + "order_products__prior.csv")
products_df = pd.read_csv(base + "products.csv")

#### Explore the data

In [3]:
order_products_train_df = order_products_train_df[["order_id", "product_id"]]
order_products_train_df.head()

Unnamed: 0,order_id,product_id
0,1,49302
1,1,11109
2,1,10246
3,1,49683
4,1,43633


In [4]:
order_products_prior_df = order_products_prior_df[["order_id", "product_id"]]
order_products_prior_df.head()

Unnamed: 0,order_id,product_id
0,2,33120
1,2,28985
2,2,9327
3,2,45918
4,2,30035


# Preprocess the data

In [5]:
order_products_df = pd.concat([order_products_train_df, order_products_prior_df])
order_products_df = order_products_df[order_products_df["order_id"] <= 100000]
order_products_df.sort_values("order_id", inplace=True, ignore_index=True)
order_products_df

Unnamed: 0,order_id,product_id
0,1,49302
1,1,11109
2,1,10246
3,1,49683
4,1,43633
...,...,...
987254,100000,30169
987255,100000,38734
987256,100000,36759
987257,100000,31506


In [6]:
products_in_transactions = order_products_df.groupby("product_id").count().rename(columns={"order_id": "no_of_transactions"}).reset_index()
products_in_transactions.sort_values("no_of_transactions", ascending=False).reset_index(drop=True)

Unnamed: 0,product_id,no_of_transactions
0,24852,14494
1,13176,11694
2,21137,8081
3,21903,7369
4,47209,6411
...,...,...
35065,28078,1
35066,28077,1
35067,38283,1
35068,38285,1


#### merge orders and products

In [7]:
order_products_df = order_products_df.merge(products_df[['product_id', 'product_name']], on='product_id')
order_products_df.sort_values("order_id", inplace=True, ignore_index=True)

In [8]:
order_products_df.head()

Unnamed: 0,order_id,product_id,product_name
0,1,49302,Bulgarian Yogurt
1,1,10246,Organic Celery Hearts
2,1,49683,Cucumber Kirby
3,1,47209,Organic Hass Avocado
4,1,22035,Organic Whole String Cheese


#### Aggregate orders

In [9]:
order_products_df = order_products_df.groupby("order_id")['product_name'].agg(lambda x: ', '.join(x)).reset_index()
order_products_df.head()

Unnamed: 0,order_id,product_name
0,1,"Bulgarian Yogurt, Organic Celery Hearts, Cucum..."
1,2,"Michigan Organic Kale, Garlic Powder, Coconut ..."
2,3,"Organic Ginger Root, Air Chilled Organic Bonel..."
3,4,"Goldfish Cheddar Baked Snack Crackers, Sugarfr..."
4,5,"American Slices Cheese, Artichokes, 2% Reduced..."


In [10]:
transactions = order_products_df['product_name'].str.split(', ')
transactions.head()

0    [Bulgarian Yogurt, Organic Celery Hearts, Cucu...
1    [Michigan Organic Kale, Garlic Powder, Coconut...
2    [Organic Ginger Root, Air Chilled Organic Bone...
3    [Goldfish Cheddar Baked Snack Crackers, Sugarf...
4    [American Slices Cheese, Artichokes, 2% Reduce...
Name: product_name, dtype: object

In [11]:
unique_items = set(item for transaction in transactions for item in transaction)
print("Unique items -->", len(unique_items))

Unique items --> 36038


In [12]:
def get_support(itemset, transactions):
    count = transactions.apply(lambda transaction: itemset.issubset(transaction)).sum()
    return count / len(transactions)

def get_confidence(itemset, antecedent, transactions):
    support_antecedent = get_support(antecedent, transactions)
    confidence = itemset["support"] / support_antecedent
    return confidence, support_antecedent

#### Initialize support and confidence thresholds

In [13]:
min_support = 0.005
min_confidence = 0.0075

# Frequent Itemsets

In [14]:
def frequent_items(unique_items, k=1):
    frequent_itemsets = []
    while unique_items:
        print(f"Unique items for k={k} --> ", len(unique_items))
        print(f"Frequent Items generation started for k={k}")
        for item_pair in tqdm(list(combinations(unique_items, k))):
            support = get_support(set(item_pair), transactions)
            if support >= min_support:
                frequent_itemsets.append({"itemset": item_pair, "support": support, "k": k})
        
        unique_items = set()
        print("Unique Items filtering started...")
        for item_set in tqdm(frequent_itemsets):
            if item_set["k"] == k:
                for item in item_set["itemset"]:
                    unique_items.add(item)
        print("\n")
        
        k += 1
    return frequent_itemsets

In [15]:
frequent_itemsets = frequent_items(unique_items, k=1)
frequent_itemsets_df = pd.DataFrame(frequent_itemsets)
frequent_itemsets_df.head()

Unique items for k=1 -->  36038
Frequent Items generation started for k=1


100%|██████████| 36038/36038 [12:51<00:00, 46.71it/s]


Unique Items filtering started...


100%|██████████| 261/261 [00:00<00:00, 2505064.86it/s]




Unique items for k=2 -->  261
Frequent Items generation started for k=2


100%|██████████| 33930/33930 [12:42<00:00, 44.48it/s]


Unique Items filtering started...


100%|██████████| 345/345 [00:00<00:00, 4493897.14it/s]




Unique items for k=3 -->  41
Frequent Items generation started for k=3


100%|██████████| 10660/10660 [04:28<00:00, 39.64it/s]


Unique Items filtering started...


100%|██████████| 346/346 [00:00<00:00, 4319134.48it/s]




Unique items for k=4 -->  3
Frequent Items generation started for k=4


0it [00:00, ?it/s]


Unique Items filtering started...


100%|██████████| 346/346 [00:00<00:00, 5239094.53it/s]








Unnamed: 0,itemset,support,k
0,"(No Salt Added Black Beans,)",0.008821,1
1,"(Unsweetened Vanilla Almond Milk,)",0.008228,1
2,"(Crackers,)",0.005652,1
3,"(Roasted Turkey Breast,)",0.005448,1
4,"(Unsalted Butter,)",0.011315,1


# Association Rules

In [16]:
def generate_association_rules(frequent_itemsets, min_confidence, transactions):
    rules = []
    print("Association rules generation started...")
    for itemset in tqdm(frequent_itemsets):
        for size in range(1, len(itemset["itemset"])):
            for subset in combinations(itemset["itemset"], size):
                antecedent = set(subset)
                consequent = set(itemset["itemset"]) - antecedent
                confidence, support_antecedent = get_confidence(itemset, antecedent, transactions)
                if confidence >= min_confidence:
                    rules.append({"antecedent": antecedent, "consequent": consequent, "support_A": support_antecedent, "support_itemset": itemset["support"], "confidence": confidence})
    return rules


In [17]:
rules = generate_association_rules(frequent_itemsets, min_confidence, transactions)
print("association Rules -->", len(rules))

Association rules generation started...


100%|██████████| 346/346 [00:03<00:00, 96.71it/s]  

association Rules --> 174





In [18]:
association_rules_df = pd.DataFrame(rules)
association_rules_df.head()

Unnamed: 0,antecedent,consequent,support_A,support_itemset,confidence
0,{Clementines},{Bag},0.021864,0.012429,0.56849
1,{Bag},{Clementines},0.01432,0.012429,0.867951
2,{Organic Garlic},{Organic Baby Spinach},0.033629,0.006521,0.193921
3,{Organic Baby Spinach},{Organic Garlic},0.075322,0.006521,0.086579
4,{Organic Garlic},{Organic Strawberries},0.033629,0.005101,0.151672
