In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

! pip install mlxtend


[0mCollecting mlxtend
  Obtaining dependency information for mlxtend from https://files.pythonhosted.org/packages/73/da/d5d77a9a7a135c948dbf8d3b873655b105a152d69e590150c83d23c3d070/mlxtend-0.23.0-py3-none-any.whl.metadata
  Downloading mlxtend-0.23.0-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.0-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m80.6 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: mlxtend
Successfully installed mlxtend-0.23.0


# Association Rule for Store Dataset

In this case study, we will explore how association rule can be used to analyze the items that are usualy purcased together.

you can refer to this article to find out about apriori and association rule:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use the dataset of the transaction in a certain store. You can get the dataset here: 
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [4]:
# load the data set and show the first five transaction
df = pd.read_csv('https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv')
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of product that has been purchased


Get the unique product that has been purchased

In [5]:
products = set(df.values.flatten())
print(products)


{'Pencil', 'Bagel', 'Eggs', 'Cheese', 'Wine', 'Bread', 'Diaper', nan, 'Meat', 'Milk'}


## Preprocess Data

In this step, we will transform our dataset so that we will have a one hot encoding based on the purchased products.

In [6]:
itemset = set()
for i in range(len(df)):
    itemset.update(df.loc[i].dropna().values)
itemset = list(itemset)
print(itemset)


['Pencil', 'Bagel', 'Eggs', 'Cheese', 'Wine', 'Bread', 'Diaper', 'Meat', 'Milk']


In [7]:
#create an itemset based on the products
itemset = set()
for i in range(len(df)):
    itemset.update(df.loc[i].dropna().values)
itemset = list(itemset)
print(itemset)

# encoding the feature
encoded_vals = []
for index, row in df.iterrows():
    labels = {}
    uncommons = list(set(itemset) - set(row))
    commons = list(set(itemset).intersection(row))
    for uc in uncommons:
        labels[uc] = 0
    for com in commons:
        labels[com] = 1
    encoded_vals.append(labels)
encoded_vals[0]


['Pencil', 'Bagel', 'Eggs', 'Cheese', 'Wine', 'Bread', 'Diaper', 'Meat', 'Milk']


{'Bagel': 0,
 'Milk': 0,
 'Pencil': 1,
 'Eggs': 1,
 'Cheese': 1,
 'Wine': 1,
 'Bread': 1,
 'Diaper': 1,
 'Meat': 1}

In [8]:
# create new dataframe from the encoded features
ohe_df = pd.DataFrame(encoded_vals)
# show the new dataframe
ohe_df.head()


Unnamed: 0,Bagel,Milk,Pencil,Eggs,Cheese,Wine,Bread,Diaper,Meat
0,0,0,1,1,1,1,1,1,1
1,0,1,1,0,1,1,1,1,1
2,0,1,0,1,1,1,0,0,1
3,0,1,0,1,1,1,0,0,1
4,0,0,1,0,0,1,0,0,1


Since, the encoded dataframe consist of the empty column. We will drop the NaN column or select all columns other than the first column.

## Apriori Algorithm

We will use appriori algorithm to determine the frequently purchased products. 
For this case study, we will min_support=0.2

In [9]:
from mlxtend.frequent_patterns import apriori

# min_support=0.2
freq_items = apriori(ohe_df, min_support=0.2, use_colnames=True)
freq_items




Unnamed: 0,support,itemsets
0,0.425397,(Bagel)
1,0.501587,(Milk)
2,0.361905,(Pencil)
3,0.438095,(Eggs)
4,0.501587,(Cheese)
5,0.438095,(Wine)
6,0.504762,(Bread)
7,0.406349,(Diaper)
8,0.47619,(Meat)
9,0.225397,"(Bagel, Milk)"


Then, we will generate association rule of the frequent itemset based on confidence level with the threshold=0.6

In [10]:
from mlxtend.frequent_patterns import association_rules
rules = association_rules(freq_items, metric="confidence", min_threshold=0.6)
rules.drop(['zhangs_metric'], axis=1)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265
1,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
2,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
3,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203
4,(Eggs),(Meat),0.438095,0.47619,0.266667,0.608696,1.278261,0.05805,1.338624
5,(Wine),(Cheese),0.438095,0.501587,0.269841,0.615942,1.227986,0.050098,1.297754
6,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891
7,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754
8,"(Cheese, Meat)",(Milk),0.32381,0.501587,0.203175,0.627451,1.250931,0.040756,1.337845
9,"(Cheese, Milk)",(Meat),0.304762,0.47619,0.203175,0.666667,1.4,0.05805,1.571429


Provide explanation about __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__ and __conviction__

Explanation of different measures used in association rule mining:

Antecedent Support refers to the frequency of occurrences of the item or set of items on the left-hand side of a rule within the dataset's transactions.

Consequent Support represents the frequency of occurrences of the item or set of items on the right-hand side of a rule within the dataset's transactions.

Support denotes the frequency of transactions in the dataset that contain both the antecedent and consequent, essentially representing the probability of both items occurring together.

Confidence measures the reliability of a rule by determining the proportion of transactions containing the antecedent that also contain the consequent.

Lift signifies the ratio between the observed support and the expected support if the antecedent and consequent were independent. A value greater than 1 suggests a higher likelihood of the items being bought together than if they were purchased independently.

Leverage computes the difference between the observed frequency of both antecedent and consequent appearing together and what would be expected if they were independent. A value of 0 implies independence.

Conviction indicates the strength of the implication within a rule. A high conviction value implies a strong dependency of the consequent on the antecedent, suggesting that the association between the items is not likely due to random chance. For example, a high conviction in the rule {Meat, Milk} -> {Cheese} would suggest that the relationship between Meat, Milk, and Cheese isn't likely due to random occurrences.
