# Association Rule Mining

# Setup packages

In [1]:
import subprocess
import sys
import pandas as pd
import matplotlib.pyplot as plt

# Make sure mlxtend is installed
try:
    import mlxtend
except ImportError as e:
    subprocess.check_call([sys.executable, "-m", "pip", "install", 'mlxtend'])
    import mlxtend


from pandas import DataFrame
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import fpmax
from mlxtend.frequent_patterns import association_rules
from ipywidgets import interact

# Ignore warnings
import warnings
warnings.simplefilter("ignore")


Collecting mlxtend
  Downloading mlxtend-0.23.1-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.1-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: mlxtend
Successfully installed mlxtend-0.23.1


# Import Product Orders

- What is the structure of the data?
- How many rows are being imported?

In [2]:
# Reading data
filteredOrders = pd.read_csv('orders_product_filtered.csv')
print(filteredOrders.head())

print("\n")
print("There are " + str(filteredOrders.shape[0]) + " rows in the dataset")

   order_id  product_id
0         2       33120
1         2       28985
2         2        9327
3         2       45918
4         2       30035


There are 100000 rows in the dataset


# Import Products

- What is the structure of the data?
- How many products are there in the dataset?

In [3]:
products_df = pd.read_csv('products.csv')
products_df.drop(columns= ["aisle_id", "department_id"],inplace=True)
print(products_df.head())

print("\n")
print("There are " + str(products_df.shape[0]) + " rows in the dataset")

   product_id                                       product_name
0           1                         Chocolate Sandwich Cookies
1           2                                   All-Seasons Salt
2           3               Robust Golden Unsweetened Oolong Tea
3           4  Smart Ones Classic Favorites Mini Rigatoni Wit...
4           5                          Green Chile Anytime Sauce


There are 49688 rows in the dataset


### from the previous cells...
- note: **product_id** connects the dataframes

<div style="text-align: left;">

# Data Preperation

## Step 1
Start with order/products in simple format<br/>

|Order ID|Product ID|
| :- |----------|
|   1    |  1234    |
|   1    |  1568    |
|   1    |  2999    |
|   1    |  3201    |
|   2    |  2953    |

Target each row as array of products<br/>
|Products|
|--------|
|[1234,1568,2999,3201]|
|[2953]|
</div>

In [4]:
filteredOrders['product_id'] = filteredOrders['product_id'].apply(str)
transaction = filteredOrders.groupby('order_id')['product_id'].apply(lambda x: ','.join(x)).reset_index()

orderlist = []

for item in transaction['product_id']:
    l = item.split(',')
    orderlist.append(l)



<div style="text-align: left;">

# Data Preperation

## Step 2
Start with order/products in array format<br/>

|Products|
|--------|
|[1234,1568,2999,3201]|
|[2953]|

Target each *n*-column True/False table of orders<br/>
|product 1|product 2|product 3|product 4|...|product *n*|
|-|-|-|-|-|-|
|True|True|False|False|...|False|
|True|False|False|False|...|True|
|False|True|False|False|...|False|
|True|True|False|True|...|False|
|False|False|False|False|...|True|
</div>

In [5]:
# Use Transaction Encoder (from mlxtend)
te = TransactionEncoder()

arr = te.fit(orderlist).transform(orderlist)
df = pd.DataFrame(arr,columns=te.columns_)

print(df.shape)
print("\n")
print("There are now " + str(df.shape[1]) + " columns (ie: products) in each row!")


(8186, 15604)


There are now 15604 columns (ie: products) in each row!


# Apriori

## Step 1 - Generate Frequent Itemsets
- Itemset is 1 or more items meeting minimum support threshhold

|Support|Itemset|
|-:|-:|
|0.1103|(1234)|
|0.0108|(1234,3265)|
|0.0112|(1234,3265,8722)|

In [6]:
# APRIORI
freqitem = apriori(df, min_support=0.01, use_colnames=True)

# FPGROWTH
# freqitem = fpgrowth(df, min_support=0.01, use_colnames=True)

# FPMAX
# freqitem = fpmax(df, min_support=0.01, use_colnames=True)\

# print(freqitem.shape)
print(freqitem)

      support        itemsets
0    0.010384         (10246)
1    0.023943         (10749)
2    0.018202         (11520)
3    0.013804         (12341)
4    0.141828         (13176)
..        ...             ...
158  0.013560  (24852, 45066)
159  0.012949  (47209, 24852)
160  0.013071  (24852, 47626)
161  0.019668  (47766, 24852)
162  0.013071  (49683, 24852)

[163 rows x 2 columns]


## Find a specific product

In [9]:
# Get desired product ID
hApple = products_df[products_df['product_name']=='Honeycrisp Apple']
print(hApple['product_id'].iloc[0])
appleId = hApple['product_id'].astype(str)

# Setup a "string" version of the itemsets column
freqitem["itemset_str"] = freqitem['itemsets'].astype(str)

# Find any itemsets with desired product id
freqitem[freqitem['itemset_str'].str.contains(appleId.iloc[0])]



45066


Unnamed: 0,support,itemsets,itemset_str
106,0.034449,(45066),frozenset({'45066'})
158,0.01356,"(24852, 45066)","frozenset({'24852', '45066'})"


# Create Assocation Rules from Frequent Itemset

In [10]:
# Creating assotiation rule from frequent set
res = association_rules(freqitem,metric='confidence', min_threshold=0.1)

print(res.head())

  antecedents consequents  antecedent support  consequent support   support  \
0     (21137)     (13176)            0.098216            0.141828  0.022844   
1     (13176)     (21137)            0.141828            0.098216  0.022844   
2     (13176)     (21903)            0.141828            0.082091  0.016980   
3     (21903)     (13176)            0.082091            0.141828  0.016980   
4     (22935)     (13176)            0.040924            0.141828  0.010139   

   confidence      lift  leverage  conviction  zhangs_metric  
0    0.232587  1.639929  0.008914    1.118267       0.432718  
1    0.161068  1.639929  0.008914    1.074919       0.454708  
2    0.119724  1.458428  0.005337    1.042751       0.366279  
3    0.206845  1.458428  0.005337    1.081974       0.342442  
4    0.247761  1.746919  0.004335    1.140825       0.445808  


In [11]:
res1 = res[['antecedents','consequents','support','confidence','lift']]
res2=res1[res1['confidence']>=0.1]
detailed_ruleList = []
for p1,p2,support,conf,lift in zip(res2['antecedents'],res2['consequents'],res2['support'],res2['confidence'],res2['lift']):
    x,*karg = p1
    x = int(x)
    aName = products_df.loc[products_df['product_id']==x, "product_name"]
    x,*karg = p2
    x = int(x)
    cName = products_df.loc[products_df['product_id']==x, "product_name"]
    detailed_ruleList.append({
        'antecedents': aName.values[0],
        'consequents': cName.values[0],
        'support': support,
        'confidence': conf,
        'lift': lift
    })


finalAssociationRule_df = pd.DataFrame(detailed_ruleList)
finalAssociationRule_df

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,Organic Strawberries,Bag of Organic Bananas,0.022844,0.232587,1.639929
1,Bag of Organic Bananas,Organic Strawberries,0.022844,0.161068,1.639929
2,Bag of Organic Bananas,Organic Baby Spinach,0.01698,0.119724,1.458428
3,Organic Baby Spinach,Bag of Organic Bananas,0.01698,0.206845,1.458428
4,Organic Yellow Onion,Bag of Organic Bananas,0.010139,0.247761,1.746919
5,Organic Garlic,Bag of Organic Bananas,0.010261,0.233333,1.645191
6,Organic Whole Milk,Bag of Organic Bananas,0.012949,0.212851,1.500777
7,Bag of Organic Bananas,Organic Raspberries,0.018446,0.13006,2.464522
8,Organic Raspberries,Bag of Organic Bananas,0.018446,0.349537,2.464522
9,Organic Hass Avocado,Bag of Organic Bananas,0.026264,0.310245,2.187483


In [12]:
# Visualization

# Sort by Confidence since we believe this is the most valuable metric
finalAssociationRule_df.sort_values(by='confidence',inplace=True,ascending=False)

# Get Y limits for each side so that graph is consistent
confidenceYMax = finalAssociationRule_df['confidence'].max() + 0.05
liftYMax = finalAssociationRule_df['lift'].max() + 0.25


@interact(item=[*sorted(finalAssociationRule_df['antecedents'].unique())])
def update(item='Bag of Organic Bananas'):

    cons = finalAssociationRule_df[finalAssociationRule_df['antecedents']==item]['consequents']
    conf = finalAssociationRule_df[finalAssociationRule_df['antecedents']==item]['confidence']
    lift = finalAssociationRule_df[finalAssociationRule_df['antecedents']==item]['lift']
    
    x = len(cons)

    # Adjust legend depending on # of x values
    legendX = 0.8
    if( x < 3 ):
        legendX = 1.1

    fig, confAxis = plt.subplots(figsize=(x*3, 5))

    fig.subplots_adjust(left=0.1, right=0.9) 

    confAxis.set_xlabel('Associated Items',labelpad=20, fontsize=22)
    confAxis.set_ylabel('Confidence', fontsize=22, color="orange")
    confAxis.set_ylim(0,confidenceYMax)
    confAxis.bar(cons,conf,
            width=-0.2,
            color='orange',
            align='edge',
            label="Confidence" )
    confAxis.legend(loc=[legendX,0.85])
    
    liftAxis = confAxis.twinx()
    liftAxis.set_ylabel('Lift', fontsize=22, color='purple')
    liftAxis.set_ylim(0,liftYMax)
    liftAxis.bar(cons, lift,           
            width=0.2,
            color='purple',
            align='edge',
            label="Lift" )
    liftAxis.legend(loc=[legendX,0.92])

interactive(children=(Dropdown(description='item', index=1, options=('Apple Honeycrisp Organic', 'Bag of Organ…

# Observations

## How many orders will contain any specific individual "Bread" product???

In [11]:
bread_df = products_df[products_df['product_name'].str.contains("bread")]
print("There are " + str(len(bread_df)) + " `BREAD` products!!!")
print("\n")

print(bread_df.head(20))

There are 168 `BREAD` products!!!


      product_id                                       product_name
659          660     Foldit Flatbreads Artisan Rosemary & Olive Oil
682          683   Town House Sea Salt & Olive Oil Flatbread Crisps
808          809                    Bolani Pumpkin Filled Flatbread
840          841         Thick Cut Italian 7-Herb Blend Crispbreads
1026        1027           Organic Old World Flatbreads Pizza Crust
1057        1058                Pure Butter Shortbread Scottie Dogs
1741        1742                                   Onion Crispbread
1878        1879                             Seedlander Crispbreads
1984        1985            Ready Crust Shortbread 9 Inch Pie Crust
2552        2553                           Rosemary Flatbread Kamut
2784        2785       Stone Ground Cornbread & Cornmeal Muffin Mix
2867        2868                 Pure Butter Shortbread Highlanders
2950        2951        Chocolate Sea Salt Stars Shortbread Cookies
2962        