In [12]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder


In [13]:
online=pd.read_excel("Online retail.xlsx")

In [14]:
online.head()

Unnamed: 0,Transaction
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."


In [15]:
df=online.copy()

# Split the transactions into lists
df['Transaction'] = df['Transaction'].apply(lambda x: x.split(','))

# Create a list of unique items
items = sorted(set(item for sublist in df['Transaction'] for item in sublist))

# Create a one-hot encoded DataFrame
encoded_df = pd.DataFrame([[1 if item in transaction else 0 for item in items] for transaction in df['Transaction']], columns=items)


In [16]:
encoded_df

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,0,1,1,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7499,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
encoded_df = encoded_df.astype(bool)


# Generate frequent itemsets

frequent_itemsets = apriori(encoded_df, min_support=0.02, use_colnames=True)

frequent_itemsets


Unnamed: 0,support,itemsets
0,0.020397,(almonds)
1,0.033329,(avocado)
2,0.033729,(brownies)
3,0.087188,(burgers)
4,0.030129,(butter)
...,...,...
98,0.020131,"(mineral water, whole wheat rice)"
99,0.022930,"(spaghetti, olive oil)"
100,0.025197,"(pancakes, spaghetti)"
101,0.021197,"(spaghetti, shrimp)"


In [26]:
# Generate association rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.7,num_itemsets=len(frequent_itemsets))

# Display the results
print(rules)

        antecedents     consequents  antecedent support  consequent support  \
0         (burgers)          (eggs)            0.087188            0.179709   
1            (eggs)       (burgers)            0.179709            0.087188   
2    (french fries)       (burgers)            0.170911            0.087188   
3         (burgers)  (french fries)            0.087188            0.170911   
4   (mineral water)       (burgers)            0.238368            0.087188   
..              ...             ...                 ...                 ...   
95      (spaghetti)      (pancakes)            0.174110            0.095054   
96      (spaghetti)        (shrimp)            0.174110            0.071457   
97         (shrimp)     (spaghetti)            0.071457            0.174110   
98      (spaghetti)      (tomatoes)            0.174110            0.068391   
99       (tomatoes)     (spaghetti)            0.068391            0.174110   

     support  confidence      lift  representativit

In [27]:
rules.sort_values('lift',ascending=False)[0:20]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
70,(spaghetti),(ground beef),0.17411,0.098254,0.039195,0.225115,2.291162,1.0,0.022088,1.163716,0.682343,0.168096,0.140684,0.312015
71,(ground beef),(spaghetti),0.098254,0.17411,0.039195,0.398915,2.291162,1.0,0.022088,1.373997,0.624943,0.168096,0.272197,0.312015
93,(olive oil),(spaghetti),0.065858,0.17411,0.02293,0.348178,1.999758,1.0,0.011464,1.267048,0.535186,0.105651,0.210764,0.239939
92,(spaghetti),(olive oil),0.17411,0.065858,0.02293,0.1317,1.999758,1.0,0.011464,1.075829,0.605334,0.105651,0.070484,0.239939
85,(mineral water),(soup),0.238368,0.050527,0.023064,0.096756,1.914955,1.0,0.01102,1.051182,0.62733,0.08676,0.04869,0.27661
84,(soup),(mineral water),0.050527,0.238368,0.023064,0.456464,1.914955,1.0,0.01102,1.401255,0.503221,0.08676,0.286354,0.27661
57,(milk),(frozen vegetables),0.129583,0.095321,0.023597,0.182099,1.910382,1.0,0.011245,1.106099,0.54749,0.117219,0.095921,0.214826
56,(frozen vegetables),(milk),0.095321,0.129583,0.023597,0.247552,1.910382,1.0,0.011245,1.156781,0.526755,0.117219,0.135532,0.214826
0,(burgers),(eggs),0.087188,0.179709,0.028796,0.330275,1.83783,1.0,0.013128,1.224818,0.499424,0.120941,0.183552,0.245256
1,(eggs),(burgers),0.179709,0.087188,0.028796,0.160237,1.83783,1.0,0.013128,1.086988,0.555754,0.120941,0.080026,0.245256


Key Interpretations:

High Confidence and Lift: For example, the rule (spaghetti → ground beef) has a confidence of 0.2251 and a lift of 2.2912, indicating that ground beef is frequently bought with spaghetti and the association is strong (more than twice as likely as by chance).

Support: This shows how frequent the itemsets are in the dataset. For instance, the support for (burgers → eggs) is 0.028796, indicating that about 2.88% of transactions contain both items.

Leverage and Conviction: These metrics help further validate the strength and reliability of the rules. High leverage and high conviction values strengthen the association rule's robustness.

Profile Insights

1. Strong Associations

(spaghetti) -> (ground beef) and (ground beef) -> (spaghetti):

These rules have high lift (2.29) and moderate confidence, indicating a strong association between spaghetti and ground beef.
(olive oil) -> (spaghetti):

Confidence (0.348) and lift (1.999) suggest that customers buying olive oil are more likely to buy spaghetti.
(soup) -> (mineral water):

High confidence (0.456) and lift (1.915) show that soup buyers often purchase mineral water.

2. Frequent Itemsets with Lower Confidence

(frozen vegetables) -> (milk):
Lift (1.91) suggests an association, but confidence is moderate (0.247).
Insight: This might indicate a weaker dependency, but it’s still noteworthy for cross-selling.

3. Cross-Selling Opportunities

(mineral water) -> (olive oil) and (olive oil) -> (mineral water):
Both rules show a positive lift (1.75), indicating that promoting these items together could be effective.

4. Rare but Strong Rules

(shrimp) -> (spaghetti):
Despite lower support (0.021), the lift (1.703) and confidence (0.296) suggest a potential niche relationship.

Actionable Recommendations

Product Pairing for Promotions:

Highlight pairings like (spaghetti, ground beef), (olive oil, spaghetti), or (mineral water, soup) in bundles or discounts.
Cross-Selling Opportunities:

Encourage customers purchasing olive oil to consider buying mineral water or spaghetti.
Suggest milk to customers buying frozen vegetables.

Customer Segmentation:

Use rules like (shrimp -> spaghetti) to target niche customer segments with specific recipes or cuisines.
Shelf Placement:

Place items with strong associations (e.g., spaghetti and ground beef) near each other to drive sales.
Marketing Campaigns:

Create campaigns around frequent pairings with strong associations, such as "Buy Olive Oil, Get 10% Off Spaghetti."


1. What is Lift, and Why is it Important in Association Rules?

Definition:

Lift measures the strength of association between items by comparing the observed co-occurrence of items to their expected co-occurrence under independence.

It is calculated as:

$
\text{Lift} = \frac{\text{Support (Antecedent ∪ Consequent)}}{\text{Support (Antecedent)} \times \text{Support (Consequent)}}
$

Importance:

Lift > 1: Indicates a positive association (items co-occur more often than expected by chance).

Lift = 1: Indicates no association (items occur together purely by chance).

Lift < 1: Indicates a negative association (items occur together less often than expected).​
 
Prioritization: Helps identify strong relationships to focus on for business decisions.

Actionable Insights: Lift reveals the true strength of association beyond mere frequency or confidence.

Avoiding Bias: Corrects for the popularity of items. For example, high-confidence rules might be trivial if the consequent is a frequently purchased item.

2. What is Support and Confidence? How Do You Calculate Them?

Definition: Support measures how frequently an itemset appears in the dataset.

Formula:

$
\text{Support} = \frac{\text{Number of Transactions Containing Itemset  }} {\text{Total Number of Transactions}}
$

Purpose: Identifies frequent itemsets to ensure the rules are statistically significant.

Confidence:

Definition: Confidence measures the likelihood that a transaction containing the antecedent also contains the consequent.
Formula:

$
\text{Confidence} = \frac{\text{Support (Antecedent ∪ Consequent) }} {\text{Support (Antecedent)}}
$

 
Purpose: Evaluates the reliability of the rule.

Example:

For the rule (spaghetti) -> (ground beef):

If spaghetti appears in 100 transactions, and spaghetti and ground beef appear together in 40 transactions, the calculations are:


$
\text{Support} = \frac{\text{40}} {\text{Total transaction}}
$

$
\text{Confidence:} = \frac{\text{40}} {\text{100}}
$
​
 =0.4 (40%)


3. What are Some Limitations or Challenges of Association Rules Mining?

1. Scalability:
Large datasets with many items lead to an exponential number of potential itemsets and rules, making computation time-consuming.
2. Choosing the Right Parameters:
Setting min_support and min_confidence thresholds is subjective and can exclude important rules or include irrelevant ones.
3. Interpretability:
High confidence or support doesn't always mean a rule is meaningful; some rules might be trivial or redundant.
4. Lack of Causality:
Association rules show correlations, but they do not imply causation (e.g., (diapers -> beer) doesn’t mean buying diapers causes beer purchases).
5. Imbalance and Popular Items:
Rules involving popular items (e.g., bread, milk) can dominate, leading to less actionable insights for niche products.
6. Handling Rare Items:
Rare but potentially valuable patterns are often missed with high support thresholds.
7. Noise and Redundancy:
Noise in data (e.g., data entry errors, outliers) can lead to misleading or redundant rules.
8. Lack of Context:
Association rules don’t account for external factors like seasonality, promotions, or demographics.
