# Import Libraries

In [1]:
import pandas as pd
from itertools import combinations

# Load the data

In [2]:
base = "./data/"

order_products_train_df = pd.read_csv(base + "order_products__train.csv")
order_products_prior_df = pd.read_csv(base + "order_products__prior.csv")
products_df = pd.read_csv(base + "products.csv")

#### Explore the data

In [3]:
order_products_train_df = order_products_train_df[["order_id", "product_id"]]
order_products_train_df.head()

Unnamed: 0,order_id,product_id
0,1,49302
1,1,11109
2,1,10246
3,1,49683
4,1,43633


In [4]:
order_products_prior_df = order_products_prior_df[["order_id", "product_id"]]
order_products_prior_df.head()

Unnamed: 0,order_id,product_id
0,2,33120
1,2,28985
2,2,9327
3,2,45918
4,2,30035


# Preprocess the data

In [5]:
order_products_df = pd.concat([order_products_train_df, order_products_prior_df])
order_products_df = order_products_df[order_products_df["order_id"] <= 10000]
order_products_df.sort_values("order_id", inplace=True, ignore_index=True)
order_products_df

Unnamed: 0,order_id,product_id
0,1,49302
1,1,11109
2,1,10246
3,1,49683
4,1,43633
...,...,...
98486,10000,10017
98487,10000,10369
98488,10000,6587
98489,10000,35108


In [6]:
products_in_transactions = order_products_df.groupby("product_id").count().rename(columns={"order_id": "no_of_transactions"}).reset_index()
products_in_transactions.sort_values("no_of_transactions", ascending=False).reset_index(drop=True)

Unnamed: 0,product_id,no_of_transactions
0,24852,1462
1,13176,1174
2,21137,789
3,21903,717
4,47209,637
...,...,...
16250,28973,1
16251,28969,1
16252,13751,1
16253,28961,1


#### merge orders and products

In [7]:
order_products_df = order_products_df.merge(products_df[['product_id', 'product_name']], on='product_id')
order_products_df.sort_values("order_id", inplace=True, ignore_index=True)

In [8]:
order_products_df.head()

Unnamed: 0,order_id,product_id,product_name
0,1,49302,Bulgarian Yogurt
1,1,47209,Organic Hass Avocado
2,1,22035,Organic Whole String Cheese
3,1,49683,Cucumber Kirby
4,1,13176,Bag of Organic Bananas


#### Aggregate orders

In [9]:
order_products_df = order_products_df.groupby("order_id")['product_name'].agg(lambda x: ', '.join(x)).reset_index()
order_products_df.head()

Unnamed: 0,order_id,product_name
0,1,"Bulgarian Yogurt, Organic Hass Avocado, Organi..."
1,2,"Organic Egg Whites, Michigan Organic Kale, Cla..."
2,3,"Organic Baby Spinach, Total 2% with Strawberry..."
3,4,"Plain Pre-Sliced Bagels, Kellogg's Nutri-Grain..."
4,5,"Artichokes, Dairy Milk Fruit & Nut Chocolate B..."


In [10]:
transactions = order_products_df['product_name'].str.split(', ')
transactions.head()

0    [Bulgarian Yogurt, Organic Hass Avocado, Organ...
1    [Organic Egg Whites, Michigan Organic Kale, Cl...
2    [Organic Baby Spinach, Total 2% with Strawberr...
3    [Plain Pre-Sliced Bagels, Kellogg's Nutri-Grai...
4    [Artichokes, Dairy Milk Fruit & Nut Chocolate ...
Name: product_name, dtype: object

In [11]:
unique_items = set(item for transaction in transactions for item in transaction)
print("Unique items -->", len(unique_items))

Unique items --> 16802


In [12]:
def get_support(itemset, transactions):
    count = transactions.apply(lambda transaction: itemset.issubset(transaction)).sum()
    return count / len(transactions)

def get_confidence(itemset, antecedent, transactions):
    support_antecedent = get_support(antecedent, transactions)
    confidence = itemset["support"] / support_antecedent
    return confidence, support_antecedent

#### Initialize support and confidence thresholds

In [13]:
min_support = 0.005
min_confidence = 0.005

# Frequent Itemsets

In [14]:
def frequent_items(unique_items, k=1):
    frequent_itemsets = []
    while unique_items:
        print(f"Unique items for k={k} --> ", len(unique_items))
        for item_pair in combinations(unique_items, k):
            support = get_support(set(item_pair), transactions)
            if support >= min_support:
                frequent_itemsets.append({"itemset": item_pair, "support": support, "k": k})
        
        unique_items = set()
        for item_set in frequent_itemsets:
            if item_set["k"] == k:
                for item in item_set["itemset"]:
                    unique_items.add(item)
        
        k += 1
    return frequent_itemsets

In [15]:
frequent_itemsets = frequent_items(unique_items, k=1)
frequent_itemsets_df = pd.DataFrame(frequent_itemsets)
frequent_itemsets_df.head()

Unique items for k=1 -->  16802
Unique items for k=2 -->  266
Unique items for k=3 -->  48
Unique items for k=4 -->  6


Unnamed: 0,itemset,support,k
0,"(Organic Gala Apples,)",0.022827,1
1,"(Oven Roasted Turkey Breast,)",0.006726,1
2,"(Small Hass Avocado,)",0.015693,1
3,"(Diced Tomatoes,)",0.005299,1
4,"(Broccoli Crown,)",0.011923,1


# Association Rules

In [16]:
def generate_association_rules(frequent_itemsets, min_confidence, transactions):
    rules = []
    for itemset in frequent_itemsets:
        for size in range(1, len(itemset["itemset"])):
            for subset in combinations(itemset["itemset"], size):
                antecedent = set(subset)
                consequent = set(itemset["itemset"]) - antecedent
                confidence, support_antecedent = get_confidence(itemset, antecedent, transactions)
                if confidence >= min_confidence:
                    rules.append({"antecedent": antecedent, "consequent": consequent, "support_A": support_antecedent, "support_itemset": itemset["support"], "confidence": confidence})
    return rules


In [17]:
rules = generate_association_rules(frequent_itemsets, min_confidence, transactions)
print("association Rules -->", len(rules))

association Rules --> 178


In [19]:
association_rules_df = pd.DataFrame(rules)
association_rules_df.head()

Unnamed: 0,antecedent,consequent,support_A,support_itemset,confidence
0,{Organic Large Extra Fancy Fuji Apple},{Bag of Organic Bananas},0.022419,0.006318,0.281818
1,{Bag of Organic Bananas},{Organic Large Extra Fancy Fuji Apple},0.119637,0.006318,0.052811
2,{Organic Red Radish},{Bunch},0.009579,0.009579,1.0
3,{Bunch},{Organic Red Radish},0.015388,0.009579,0.622517
4,{Organic Baby Carrots},{Banana},0.023438,0.005299,0.226087
