### Association Rules - Apriori

**Author:** Prof. Sandro Camargo (<https://github.com/sandrocamargo>)

Data Mining Course https://moodle.unipampa.edu.br/moodle/course/view.php?id=5213

This notebook demonstrates the basic concepts of association rule mining with the Apriori algorithm.

This notebook uses the Groceries dataset from Kaggle: https://www.kaggle.com/datasets/heeraldedhia/groceries-dataset?select=Groceries_dataset.csv

To open this notebook in your Google Colab environment, [click here](https://colab.research.google.com/github/Sandrocamargo/data-mining/blob/main/Python/md08_apriori.ipynb).

In [None]:
!pip install apyori

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from apyori import apriori
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
!curl -L -o groceries-dataset.zip https://www.kaggle.com/api/v1/datasets/download/heeraldedhia/groceries-dataset
!echo A | unzip groceries-dataset.zip

In [None]:
# Load the purchase records from the extracted CSV file into a DataFrame
dataset = pd.read_csv(filepath_or_buffer='Groceries_dataset.csv')

In [None]:
# Preview the first ten rows of the loaded data
dataset.head(n=10)

In [None]:
# Create a unique transaction ID by combining member number and date:
# everything one member bought on one date is treated as a single basket.
dataset['Transaction'] = dataset['Member_number'].astype(str) + '_' + dataset['Date']

# Show only a few rows to confirm the new column
print(dataset.head())

# Group items by transaction: one list of item descriptions per basket
transactions = dataset.groupby('Transaction')['itemDescription'].apply(list).tolist()

# Printing every basket floods the cell output; report the count and a small sample instead
print(f"{len(transactions)} transactions; first 5:")
print(*transactions[:5], sep='\n')

In [None]:
# Apply the Apriori algorithm to the list of baskets.
#
# Thresholds (all are lower bounds):
#   min_support    = 0.01 -> keep itemsets found in at least 1% of the transactions.
#                            A count-based target can be converted with
#                            (minimum number of baskets) / len(transactions).
#   min_confidence = 0.01 -> deliberately low so that weak relations between products
#                            are kept as well; a high value might miss some of them.
#   min_lift       = 0    -> no filtering by lift; raise it to keep only the most
#                            relevant rules.
#
# NOTE: apyori's apriori() reads only min_support, min_confidence, min_lift and
# max_length. The `min_length = 2` argument passed previously was silently ignored,
# so it is dropped here; records with an empty antecedent are therefore still returned
# and have to be filtered when the rules are extracted.
results = list(apriori(transactions, min_support=0.01, min_confidence=0.01, min_lift=0))

In [None]:
# Flatten the Apriori output into one row per rule (antecedent => consequent)
rules_list = []
for rule in results:
    for stat in rule.ordered_statistics:
        # An empty antecedent has no "if" part, so it is not a usable rule
        if len(stat.items_base) > 0:
            rules_list.append({
                # The item collections are unordered sets; sort them so the text
                # labels are identical from one run to the next
                'base': ', '.join(sorted(stat.items_base)),
                'add': ', '.join(sorted(stat.items_add)),
                'support': rule.support,
                'confidence': stat.confidence,
                'lift': stat.lift
            })

# Explicit columns keep the expected schema even when no rule passes the thresholds,
# so later cells fail on "no data" rather than on a missing column
rules_df = pd.DataFrame(rules_list, columns=['base', 'add', 'support', 'confidence', 'lift'])
print(rules_df)

In [None]:
# See the items that were bought together with their support
# NOTE(review): this cell is disabled (commented-out) code. The rules_df table built
# from `results` appears to cover the same information; consider deleting this cell.
# NOTE(review): results[i][2] is labelled CONF below, but it presumably holds the
# record's ordered_statistics rather than a single confidence value - confirm before
# re-enabling.
#results_list = []
#for i in range(0, len(results)):
#    results_list.append('RULE:' + str(results[i][0]) + '\nSUP:' + str(results[i][1]) + '\nCONF:' + str(results[i][2]) + '\n\n')

In [None]:
# NOTE(review): disabled (commented-out) code. It prints `results_list`, which is only
# built by another commented-out cell, so it cannot run as-is; consider deleting it.
#print("The list of rules generated by the Apriori algorithm are:")
#for i in range(0, len(results_list)):
#    print(results_list[i])

In [None]:
# Rank the rules by lift and keep the 15 strongest (fewer if fewer rules exist)
top_lift = rules_df.sort_values(by='lift', ascending=False).head(15)

plt.figure(figsize=(12, 6))
plt.barh(range(len(top_lift)), top_lift['lift'], color='skyblue')
# Label each bar with its rule, written as "antecedent => consequent"
plt.yticks(range(len(top_lift)), [f"{a} => {b}" for a, b in zip(top_lift['base'], top_lift['add'])])
plt.xlabel('Lift')
# The title reports the number of rules actually plotted; it used to be hard-coded
# as "Top 10" although up to 15 bars are drawn
plt.title(f'Top {len(top_lift)} Association Rules by Lift')
# Put the rule with the highest lift at the top of the chart
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

In [None]:
# Scatter of every rule: support on x, confidence on y, point colour encodes lift
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    rules_df['support'],
    rules_df['confidence'],
    c=rules_df['lift'],
    cmap='viridis',
    alpha=0.7,
)
fig.colorbar(points, ax=ax, label='Lift')
ax.set_title('Support vs Confidence')
ax.set_xlabel('Support')
ax.set_ylabel('Confidence')
ax.grid(True)
plt.show()

In [None]:
# Support against lift; confidence drives both the marker size and the marker colour
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=rules_df,
    x='support',
    y='lift',
    hue='confidence',
    size='confidence',
    sizes=(50, 300),
    palette='cool',
    legend='brief',
    ax=ax,
)
ax.set_title('Support vs Lift (size and color = Confidence)')
ax.set_xlabel('Support')
ax.set_ylabel('Lift')
ax.grid(True)
plt.show()

In [None]:
# Confidence against lift; support drives both the marker size and the marker colour
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=rules_df,
    x='confidence',
    y='lift',
    hue='support',
    size='support',
    sizes=(50, 300),
    palette='viridis',
    ax=ax,
)
ax.set_title('Confidence vs Lift (size and color = Support)')
ax.set_xlabel('Confidence')
ax.set_ylabel('Lift')
ax.grid(True)
plt.show()

In [None]:
# Reshape the rules into a matrix: one row per antecedent, one column per consequent,
# each cell holding the lift of that rule
pivot = rules_df.pivot(
    index='base',
    columns='add',
    values='lift',
)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(pivot, annot=True, fmt=".2f", cmap='YlGnBu', ax=ax)
ax.set_title('Lift Heatmap of Association Rules')
ax.set_xlabel('Consequent (add)')
ax.set_ylabel('Antecedent (base)')
fig.tight_layout()
plt.show()


In [None]:
from pandas.plotting import parallel_coordinates

# Keep the three metrics, attach a readable "antecedent => consequent" label,
# then retain the ten rules with the highest lift
rules_df_top = (
    rules_df[['support', 'confidence', 'lift']]
    .assign(rule=rules_df['base'] + ' => ' + rules_df['add'])
    .sort_values(by='lift', ascending=False)
    .head(10)
)

# One line per rule across the three metric axes
fig, ax = plt.subplots(figsize=(12, 6))
parallel_coordinates(rules_df_top, class_column='rule', colormap='viridis', ax=ax)
ax.set_title('Parallel Coordinates Plot of Top Rules')
ax.set_ylabel('Metric Value')
ax.tick_params(axis='x', labelrotation=45)
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
from mpl_toolkits.mplot3d import Axes3D

# Three-dimensional view of all rules: support, confidence and lift on the three axes,
# with lift repeated as the point colour
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')

ax.scatter(
    rules_df['support'],
    rules_df['confidence'],
    rules_df['lift'],
    c=rules_df['lift'],
    cmap='plasma',
    s=100,
    alpha=0.7,
)

ax.set(
    xlabel='Support',
    ylabel='Confidence',
    zlabel='Lift',
    title='3D Scatter Plot of Association Rules',
)
plt.show()
