In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder

!pip install mlxtend==0.23.1

Collecting mlxtend==0.23.1
  Downloading mlxtend-0.23.1-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.1-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mlxtend
  Attempting uninstall: mlxtend
    Found existing installation: mlxtend 0.23.3
    Uninstalling mlxtend-0.23.3:
      Successfully uninstalled mlxtend-0.23.3
Successfully installed mlxtend-0.23.1


# Association Rule for Store Dataset

In this case study, we will explore how association rules can be used to analyze the items that are usually purchased together.

You can refer to these articles to learn about the Apriori algorithm and association rules:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use a dataset of transactions from a store. You can get the dataset here:
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [3]:
# Read the store transactions from the public gist and preview the first five of them
csv_url = "https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv"
data = pd.read_csv(csv_url)
data.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of products that have been purchased


In [4]:
# Flatten the table into a single column-by-column series, then keep each
# distinct cell value once (the NaN that pads short transactions is one of them)
flattened_cells = data.unstack()
product = flattened_cells.unique()
print(product)

['Bread' 'Cheese' 'Meat' 'Eggs' 'Wine' 'Bagel' 'Pencil' 'Diaper' 'Milk'
 nan]


## Preprocess Data

In this step, we will transform our dataset into a one-hot encoding of the purchased products: one row per transaction and one column per product.

In [5]:
# Start from an all-zero indicator for every distinct value found in the data
items = dict.fromkeys(product, 0)

# Flag the entries that occur in the first transaction (row 0)
first_transaction = data.iloc[0]
for purchased in first_transaction:
    if purchased in items:
        items[purchased] = 1

items

{'Bread': 1,
 'Cheese': 1,
 'Meat': 1,
 'Eggs': 1,
 'Wine': 1,
 'Bagel': 0,
 'Pencil': 1,
 'Diaper': 1,
 'Milk': 0,
 nan: 0}

In [8]:
# One list per transaction; shorter transactions stay padded with NaN
reshaped_data = data.values.tolist()

# Walk the rows once, collecting every cell in row order together with the
# index of the row (customer) it came from
flat_items = []
customer_ids = []
for customer_id, basket in enumerate(reshaped_data):
    flat_items.extend(basket)
    customer_ids.extend([customer_id] * len(basket))

# Turn the flat list into a 2D column so that each cell is its own row
flat_items_array = np.array(flat_items)[:, np.newaxis]

# One-hot encode every cell and cast the indicators to integers (1 and 0)
encoder = OneHotEncoder(sparse_output=False)
encoded_data = encoder.fit_transform(flat_items_array).astype(int)

# Label the indicator columns with the fitted categories and tag each cell
# with the customer it belongs to
encoded_df = pd.DataFrame(encoded_data, columns=encoder.categories_[0]).assign(
    customer_id=customer_ids
)

# Collapse the per-cell rows back to one row per customer.
# sum() counts occurrences, which is why the 'nan' column can hold values above 1.
final_df = encoded_df.groupby('customer_id').sum()

# The aggregated table is the encoded dataset used by the following cells
transformed_data = final_df

# show the new dataframe
transformed_data.head()

Unnamed: 0_level_0,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,Pencil,Wine,nan
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,1,1,1,1,1,0,1,1,0
1,0,1,1,1,0,1,1,1,1,0
2,0,0,1,0,1,1,1,0,1,2
3,0,0,1,0,1,1,1,0,1,2
4,0,0,0,0,0,1,0,1,1,4


In [9]:
# The encoded dataframe still contains a 'nan' column, produced by the empty (NaN)
# cells of short transactions. It is not a product, so drop it by name.
# errors='ignore' keeps this cell re-runnable: once the column is gone, running the
# cell again is a no-op instead of raising KeyError.
transformed_data = transformed_data.drop(columns='nan', errors='ignore')
transformed_data.head()

Unnamed: 0_level_0,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,Pencil,Wine
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,1,1,1,1,1,0,1,1
1,0,1,1,1,0,1,1,1,1
2,0,0,1,0,1,1,1,0,1
3,0,0,1,0,1,1,1,0,1
4,0,0,0,0,0,1,0,1,1


Since the encoded dataframe contains a column for the empty (NaN) cells, we drop the `nan` column by name; equivalently, you could select every column except the last one.

## Apriori Algorithm

We will use the Apriori algorithm to determine the frequently purchased products.
For this case study, we will use min_support=0.2.

In [10]:
# Set the minimum support threshold used when mining the frequent itemsets
from mlxtend.frequent_patterns import apriori, association_rules

# Cast the indicators to bool before mining: any non-zero count becomes True, so the
# input is a strict presence/absence table (the input type mlxtend documents for apriori)
frequent_itemsets = apriori(transformed_data.astype(bool), min_support=0.2, use_colnames=True)
frequent_itemsets



Unnamed: 0,support,itemsets
0,0.425397,(Bagel)
1,0.504762,(Bread)
2,0.501587,(Cheese)
3,0.406349,(Diaper)
4,0.438095,(Eggs)
5,0.47619,(Meat)
6,0.501587,(Milk)
7,0.361905,(Pencil)
8,0.438095,(Wine)
9,0.279365,"(Bread, Bagel)"


Then we will generate association rules from the frequent itemsets, based on confidence with a minimum threshold of 0.6.

In [11]:
# Build the rules using confidence with a minimum threshold of 0.6, then leave the
# 'zhangs_metric' column out of the resulting table
rules = (
    association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)
    .drop(columns=['zhangs_metric'])
)
rules

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265
1,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203
2,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754
3,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891
4,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
5,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
6,(Wine),(Cheese),0.438095,0.501587,0.269841,0.615942,1.227986,0.050098,1.297754
7,(Eggs),(Meat),0.438095,0.47619,0.266667,0.608696,1.278261,0.05805,1.338624
8,"(Meat, Cheese)",(Eggs),0.32381,0.438095,0.215873,0.666667,1.521739,0.074014,1.685714
9,"(Meat, Eggs)",(Cheese),0.266667,0.501587,0.215873,0.809524,1.613924,0.082116,2.616667


Provide an explanation of __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__, __conviction__ and the interpretation of the case above (please use a text section).

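1. Antecedents and Consequents: These represent the "if-then" relationships. For example, in the first row, if a customer buys a Bagel, they are likely to buy Bread. Antecedent support and consequent support are the fractions of all transactions that contain the antecedent and the consequent, respectively: in that same row, 42.5% of transactions contain Bagel and 50.5% contain Bread.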
2. Support: This measures how frequently the antecedent and consequent occur together in the dataset. For instance, the support for the Bagel → Bread rule is 0.279, indicating that 27.9% of transactions include both items.
3. Confidence: This indicates the likelihood of purchasing the consequent when the antecedent is purchased. For example, the confidence of Eggs → Cheese is 68.1%, meaning that 68.1% of customers who buy Eggs also buy Cheese.
4. Lift: This evaluates how much more likely the consequent is to occur when the antecedent is present than it is overall, i.e. confidence divided by consequent support. For instance, the lift for Meat → Cheese is 0.68 / 0.502 ≈ 1.356, suggesting a positive correlation: customers who buy Meat are 1.356 times as likely to buy Cheese as customers in general. A lift of 1 would mean the two are independent.
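5. Leverage: This measures the difference between the observed co-occurrence of items and their expected co-occurrence under independence, i.e. support − (antecedent support × consequent support). For example, the leverage for Meat → Cheese is 0.324 − (0.476 × 0.502) ≈ 0.085, indicating a moderate level of interdependence; a leverage of 0 would mean independence.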
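6. Conviction: This metric reflects the strength of the rule in terms of how frequently it is invalidated: it compares how often the rule would be wrong if the antecedent and consequent were independent with how often it is actually wrong, i.e. (1 − consequent support) / (1 − confidence). For instance, the conviction of Meat → Cheese is (1 − 0.502) / (1 − 0.68) ≈ 1.558, showing a moderately strong association; a value of 1 would mean independence.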

The strongest relationships include Meat, Eggs → Cheese, which has high confidence (80.9%) and lift (1.614), indicating a robust dependency. Similarly, Meat, Milk → Cheese exhibits high confidence (83.1%) and the highest lift (1.657), signifying that Cheese is strongly associated with these two items. Conversely, rules with lower lift and confidence, such as Milk → Cheese (confidence: 60.8%, lift: 1.211), indicate weaker associations.