# Task 1 - Data Preparation
For this task, you will perform the following steps:
- Load all the necessary packages for this exercise
- Load the data
- Convert the data to the Boolean data type

In [None]:
# Import 'numpy' and 'pandas' to work with numbers and dataframes
import numpy as np
import pandas as pd

# Import 'matplotlib.pyplot' and 'seaborn' for visualizations
from matplotlib import pyplot as plt
import seaborn as sns

# Import 'apriori' and 'association_rules' from 'mlxtend' for working with association rules
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [None]:
# Load the data (rows are indexed by the 'transID' column) and take a look at it
# Note: The data needs to be in binary matrix format
df = pd.read_csv('supermarket_binarymat.csv', index_col = 'transID')

# Fail fast if any cell is something other than 0/1 (or False/True)
# Note: The Boolean conversion performed afterwards would otherwise silently turn counts,
#       text and missing values into True and distort every support value
assert ((df == 0) | (df == 1)).all().all(), "'supermarket_binarymat.csv' must contain only 0/1 values"
df.head()

In [None]:
# Cast every column of the data frame to the Boolean data type and preview the result
# Note: The cast is not strictly required, but Boolean data is computationally faster to work with
df = df.astype('bool')
df.head()

# Task 2 - Build and Analyze Frequent Itemsets
For this task, you will perform the following steps:
- Generate collections of frequent itemsets for different minimum support threshold values
  - Analyze how the number of itemsets in these collections varies as the minimum support threshold is increased
  - Analyze how the mean support of the itemsets in these collections varies as the minimum support threshold is increased
  - Choose a single collection of itemsets on which to build rules

In [None]:
# Mine the frequent itemsets of 'df' with 'apriori()' at a minimum support threshold of 0.01
# Hint: Study the documentation of 'apriori()'
# Note: The 'use_colnames' parameter is set to 'True'
frequent_itemsets1 = apriori(
    df,
    min_support=0.01,
    use_colnames=True,
)
frequent_itemsets1

In [None]:
# Count the itemsets in 'frequent_itemsets1' (minimum support 0.01) using the built-in 'len()' function
len(frequent_itemsets1)

In [None]:
# Mine the frequent itemsets of 'df' with 'apriori()' at a minimum support threshold of 0.05
# Hint: Study the documentation of 'apriori()'
# Note: The 'use_colnames' parameter is set to 'True'
frequent_itemsets2 = apriori(
    df,
    min_support=0.05,
    use_colnames=True,
)
frequent_itemsets2

In [None]:
# Count the itemsets in 'frequent_itemsets2' (minimum support 0.05) using the built-in 'len()' function
len(frequent_itemsets2)

In [None]:
# Mine the frequent itemsets of 'df' with 'apriori()' at a minimum support threshold of 0.1
# Hint: Study the documentation of 'apriori()'
# Note: The 'use_colnames' parameter is set to 'True'
frequent_itemsets3 = apriori(
    df,
    min_support=0.1,
    use_colnames=True,
)
frequent_itemsets3

In [None]:
# Count the itemsets in 'frequent_itemsets3' (minimum support 0.1) using the built-in 'len()' function
len(frequent_itemsets3)

In [None]:
# Obtain collections of itemsets from the data for different values of 'min_support' using 'apriori()'

# Declare the minimum support values to iterate over: from 0.01 up to (but not including) 1 in steps of 0.01
# Note: 'np.arange()' with a fractional step can produce values such as 0.060000000000000005, which would make
#       a threshold marginally stricter than intended, so every value is rounded to two decimals
suplist = np.round(np.arange(0.01, 1, 0.01), 2)

# Initiate lists to store, per threshold, the collection of itemsets, the number of itemsets in it,
# and the mean support of its itemsets (all three lists stay aligned with 'suplist')
itemsetcollections = []
nitemsets = []
meansuplist = []

# Iterate over the minimum support threshold values
for supvalue in suplist:
    # Mine the itemsets whose support reaches the current threshold
    tempitemsetcollection = apriori(df, min_support = supvalue, use_colnames = True)

    # Store the current collection of itemsets
    itemsetcollections.append(tempitemsetcollection)

    # Store the number of itemsets in the current collection
    nitemsets.append(len(tempitemsetcollection))

    # Store the mean support of the itemsets in the current collection
    # Note: The mean of an empty collection is NaN; it is replaced by 0 when the summary is built below
    meansuplist.append(tempitemsetcollection['support'].mean())

# Create and display a data frame with one row per threshold
# Note: Missing values are filled with 0 because, if a collection holds no itemsets,
#       its mean support is 0 for all practical purposes
itemsetsummary = pd.DataFrame(data = {'minimum support': suplist,
                                      'number of itemsets': nitemsets,
                                      'mean support': meansuplist}).fillna(0)
itemsetsummary

In [None]:
# Visualize how the collections of itemsets vary as the minimum support threshold is increased
# Note: The two panels share a single row (1 x 2 grid), so no part of the figure is left blank;
#       each panel keeps roughly the 8 x 4 inch footprint it had before
fig, (countaxis, meanaxis) = plt.subplots(1, 2, figsize = (16, 4))

# Left panel: number of itemsets per collection (x values rounded to two decimals for clean tick labels)
sns.lineplot(data = itemsetsummary,
             x = itemsetsummary['minimum support'].round(2),
             y = 'number of itemsets',
             color = 'red',
             ax = countaxis)
countaxis.set_title('Number of Itemsets per Collection as a Function of Minimum Support')

# Right panel: mean support of the itemsets per collection
sns.lineplot(data = itemsetsummary,
             x = itemsetsummary['minimum support'].round(2),
             y = 'mean support',
             color = 'blue',
             ax = meanaxis)
meanaxis.set_title('Mean Support of Itemsets per Collection as a Function of Minimum Support')

# Keep the long titles and the axis labels from overlapping
fig.tight_layout();

In [None]:
# Obtain all the collections of itemsets that have at least 10 itemsets and whose mean support per collection is at least 0.05
# Note: The criteria are read from 'itemsetsummary', which has one row per collection in the same order as
#       'itemsetcollections', rather than from the helper lists, because the name 'meansuplist' is reused
#       later in the notebook and would make this cell depend on the order in which cells are run
keepcollection = (itemsetsummary['number of itemsets'] >= 10) & (itemsetsummary['mean support'] >= 0.05)
itemsetcollectionsfinal = [collection
                           for collection, keep in zip(itemsetcollections, keepcollection)
                           if keep]

In [None]:
# Check how many collections of itemsets satisfy these requirements (length of 'itemsetcollectionsfinal')
len(itemsetcollectionsfinal)

In [None]:
# Select and view the first qualifying collection of itemsets, which is the one with the maximum number of itemsets
# Hint: The number of itemsets per collection decreases monotonically as the minimum support threshold is increased
# Note: Stop with an explanatory error, instead of a bare IndexError, when no collection satisfied the requirements
if not itemsetcollectionsfinal:
    raise ValueError("No collection of itemsets satisfies the requirements; "
                     "relax the thresholds used to build 'itemsetcollectionsfinal'")
itemsetcollectionfinal = itemsetcollectionsfinal[0]
itemsetcollectionfinal

# Task 3 - Build and Analyze Association Rules
For this task, you will perform the following steps:
- Explore the rules generated from the selected collection of itemsets for different performance metrics and threshold values
- Generate association rule sets from the selected collection of itemsets for different minimum lift threshold values
  - Analyze how the number of rules in these collections varies as the minimum lift threshold is increased
  - Analyze how the mean support, mean confidence and mean lift ratio of the rules in these collections vary as the minimum lift threshold is increased
  - Choose a single collection of rules from which strong rules can be extracted

In [None]:
# Generate association rules for the collection of itemsets 'itemsetcollectionfinal' using the 'association_rules()' function
# Hint: Study the documentation of 'association_rules()'
# Note: 'metric' and 'min_threshold' are deliberately left at their default values here;
#       the cells that follow pass them explicitly to compare different metrics and thresholds
# NOTE(review): the default values are defined by the installed 'mlxtend' version - confirm them in its documentation
rules1 = association_rules(itemsetcollectionfinal)
rules1.head()

In [None]:
# Count the rules in 'rules1' (default metric and threshold) using the built-in 'len()' function
len(rules1)

In [None]:
# Build association rules from 'itemsetcollectionfinal' with 'association_rules()',
# using support as the metric and a minimum threshold of 0.05
# Hint: Study the documentation of 'association_rules()'
rules2 = association_rules(
    itemsetcollectionfinal,
    metric='support',
    min_threshold=0.05,
)
rules2.head()

In [None]:
# Count the rules in 'rules2' (support metric, threshold 0.05) using the built-in 'len()' function
len(rules2)

In [None]:
# Build association rules from 'itemsetcollectionfinal' with 'association_rules()',
# using confidence as the metric and a minimum threshold of 0.25
# Hint: Study the documentation of 'association_rules()'
rules3 = association_rules(
    itemsetcollectionfinal,
    metric='confidence',
    min_threshold=0.25,
)
rules3.head()

In [None]:
# Count the rules in 'rules3' (confidence metric, threshold 0.25) using the built-in 'len()' function
len(rules3)

In [None]:
# Build association rules from 'itemsetcollectionfinal' with 'association_rules()',
# using lift as the metric and a minimum threshold of 1.5
# Hint: Study the documentation of 'association_rules()'
rules4 = association_rules(
    itemsetcollectionfinal,
    metric='lift',
    min_threshold=1.5,
)
rules4.head()

In [None]:
# Count the rules in 'rules4' (lift metric, threshold 1.5) using the built-in 'len()' function
len(rules4)

In [None]:
# Summarize the ruleset 'rules2' with 'describe()' and flip the result so that each statistic becomes a column
rules2.describe().T

In [None]:
# Summarize the ruleset 'rules3' with 'describe()' and flip the result so that each statistic becomes a column
rules3.describe().T

In [None]:
# Summarize the ruleset 'rules4' with 'describe()' and flip the result so that each statistic becomes a column
rules4.describe().T

In [None]:
# Obtain collections of rules from 'itemsetcollectionfinal' for different values of minimum lift using 'association_rules()'

# Declare the minimum lift values to iterate over: from 0 up to (but not including) 2 in steps of 0.05
# Note: 'np.arange()' with a fractional step can produce values such as 0.15000000000000002, which would make
#       a threshold marginally stricter than intended, so every value is rounded to two decimals
liftlist = np.round(np.arange(0, 2, 0.05), 2)

# Initiate lists to store, per threshold, the collection of rules, the number of rules in it, and the
# mean support, mean confidence, and mean lift ratio of its rules (all lists stay aligned with 'liftlist')
# NOTE: 'meansuplist' was also used for the itemset collections in Task 2; from here on it holds rule-level means
rulesets = []
nrules = []
meansuplist = []
meanconflist = []
meanliftlist = []

# Iterate over the minimum lift threshold values, using lift as the metric
for liftvalue in liftlist:
    # Keep the rules whose lift reaches the current threshold
    tempruleset = association_rules(itemsetcollectionfinal, metric = 'lift', min_threshold = liftvalue)

    # Store the current collection of rules
    rulesets.append(tempruleset)

    # Store the number of rules in the current collection
    nrules.append(len(tempruleset))

    # Store the mean support, mean confidence, and mean lift ratio of the rules in the current collection
    # Note: The mean of an empty collection is NaN; it is replaced by 0 when the summary is built below
    meansuplist.append(tempruleset['support'].mean())
    meanconflist.append(tempruleset['confidence'].mean())
    meanliftlist.append(tempruleset['lift'].mean())

# Create and display a data frame with one row per threshold
# Note: Missing values are filled with 0 because, if a collection holds no rules,
#       its mean support, confidence, and lift are 0 for all practical purposes
rulesetsummary = pd.DataFrame(data = {'minimum lift': liftlist,
                                      'number of rules': nrules,
                                      'mean support': meansuplist,
                                      'mean confidence': meanconflist,
                                      'mean lift': meanliftlist}).fillna(0)
rulesetsummary

In [None]:
# Plot every summary column of 'rulesetsummary' against the minimum lift threshold on a 2 x 2 grid of panels
fig, axes = plt.subplots(2, 2, figsize = (16, 8))

# One entry per panel, in reading order: (summary column, line color, panel title)
panels = [
    ('number of rules', 'red', 'Number of Rules per Collection as a Function of Minimum Lift'),
    ('mean support', 'blue', 'Mean Support of Rules per Collection as a Function of Minimum Lift'),
    ('mean confidence', 'green', 'Mean Confidence of Rules per Collection as a Function of Minimum Lift'),
    ('mean lift', 'orange', 'Mean Lift Ratio of Rules per Collection as a Function of Minimum Lift'),
]

for ax, (column, linecolor, paneltitle) in zip(axes.flat, panels):
    # The thresholds are rounded to two decimals before being used as x values
    sns.lineplot(data = rulesetsummary,
                 x = rulesetsummary['minimum lift'].round(2),
                 y = column,
                 color = linecolor,
                 ax = ax)
    ax.set_title(paneltitle)

plt.tight_layout();

In [None]:
# Obtain all the collections of rules that have at least 20 rules, whose mean support per collection is at least 0.04, whose mean confidence per collection is at least 0.2, and whose mean lift ratio per collection is at least 1.6
# Note: The criteria are read from 'rulesetsummary', which has one row per collection in the same order as
#       'rulesets', rather than from the helper lists, because the name 'meansuplist' is shared with the
#       itemset analysis in Task 2 and would make this cell depend on the order in which cells are run
keepruleset = ((rulesetsummary['number of rules'] >= 20)
               & (rulesetsummary['mean support'] >= 0.04)
               & (rulesetsummary['mean confidence'] >= 0.2)
               & (rulesetsummary['mean lift'] >= 1.6))
rulesetsfinal = [ruleset
                 for ruleset, keep in zip(rulesets, keepruleset)
                 if keep]

In [None]:
# Check how many collections of rules satisfy these requirements (length of 'rulesetsfinal')
len(rulesetsfinal)

In [None]:
# Select and view the first qualifying collection of rules, which is the one with the maximum number of rules
# Hint: The number of rules per collection decreases monotonically as the minimum lift threshold is increased
# Note: Stop with an explanatory error, instead of a bare IndexError, when no collection satisfied the requirements
if not rulesetsfinal:
    raise ValueError("No collection of rules satisfies the requirements; "
                     "relax the thresholds used to build 'rulesetsfinal'")
rulesetfinal = rulesetsfinal[0]
rulesetfinal

# Task 4 - Identify Strong Rules
For this task, you will perform the following steps:
- Sort the selected collection of rules by support, confidence and lift and see which rules come up at the top
- Visualize the strengths of the rules using a scatter plot
- Select rules with high support, confidence and lift

In [None]:
# Show the five rules of 'rulesetfinal' with the highest support
# Note: 'sort_values()' is called with 'ascending = False' so that the largest values come first
rulesetfinal.sort_values('support', ascending=False).head(5)

In [None]:
# Show the five rules of 'rulesetfinal' with the highest confidence
# Note: 'sort_values()' is called with 'ascending = False' so that the largest values come first
rulesetfinal.sort_values('confidence', ascending=False).head(5)

In [None]:
# Show the five rules of 'rulesetfinal' with the highest lift
# Note: 'sort_values()' is called with 'ascending = False' so that the largest values come first
rulesetfinal.sort_values('lift', ascending=False).head(5)

In [None]:
# Scatter the rules of 'rulesetfinal' by support (x) and confidence (y), colored by lift
fig, ax = plt.subplots(figsize = (8, 6))
sns.scatterplot(data = rulesetfinal,
                x = 'support',
                y = 'confidence',
                hue = 'lift',
                palette = 'Oranges',
                ax = ax)

# Anchor the lift legend at (1.15, 1) in axes coordinates and switch the grid on
ax.legend(bbox_to_anchor = (1.15, 1), title = 'Lift')
ax.grid();

In [None]:
# Redraw the support/confidence scatter plot and outline, with dashed red lines, the region that spans
# 0.04 to 0.06 on the support axis and 0.4 to 0.5 on the confidence axis
fig, ax = plt.subplots(figsize = (8, 6))
sns.scatterplot(data = rulesetfinal,
                x = 'support',
                y = 'confidence',
                hue = 'lift',
                palette = 'Oranges',
                ax = ax)

# Corners of the outlined region and the line style shared by its four edges
supportlow, supporthigh = 0.04, 0.06
conflow, confhigh = 0.4, 0.5
edgestyle = {'colors': 'red', 'linestyles': '--'}

# Left, bottom, right, and top edge of the region
ax.vlines(x = supportlow, ymin = conflow, ymax = confhigh, **edgestyle)
ax.hlines(y = conflow, xmin = supportlow, xmax = supporthigh, **edgestyle)
ax.vlines(x = supporthigh, ymin = conflow, ymax = confhigh, **edgestyle)
ax.hlines(y = confhigh, xmin = supportlow, xmax = supporthigh, **edgestyle)

ax.legend(bbox_to_anchor = (1.15, 1), title = 'Lift')
ax.grid();

In [None]:
# Keep only the rules whose support exceeds 0.04 and whose confidence exceeds 0.4
rulesetfinal.loc[
    rulesetfinal['support'].gt(0.04)
    & rulesetfinal['confidence'].gt(0.4)
]

In [None]:
# Keep only the rules whose support exceeds 0.04, whose confidence exceeds 0.4, and whose lift ratio exceeds 1.75
rulesetfinal.loc[
    rulesetfinal['support'].gt(0.04)
    & rulesetfinal['confidence'].gt(0.4)
    & rulesetfinal['lift'].gt(1.75)
]

In [None]:
# Extract the rules whose antecedent is exactly the single item 'root vegetables'
# Note: '==' tests for equality with the set {'root vegetables'}, so rules whose antecedent merely
#       includes 'root vegetables' alongside other items are not expected to be returned
# NOTE(review): assumes each entry of 'antecedents' is a set-like object that is compared element-wise - confirm
rulesetfinal[rulesetfinal['antecedents'] == {'root vegetables'}]