In [7]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

# Load the dataset. header=None because the sheet has no header row: each row
# is a single cell (column 0) holding one comma-separated transaction.
df = pd.read_excel("Online retail.xlsx", header=None)

# Display the first few rows of the dataframe
print("Initial Dataset Preview:")
print(df.head())

# Convert each transaction string into a list of product names.
#   * dropna()    - skip empty cells; NaN is a float and has no .split(), so
#                   a single blank cell would otherwise abort the whole cell.
#   * astype(str) - a cell parsed as a number is still tokenised as text.
#   * the filter  - trailing or doubled commas yield empty tokens, which would
#                   otherwise show up as a bogus '' product column.
transactions = (
    df[0]
    .dropna()
    .astype(str)
    .str.split(',')
    .apply(lambda items: [name.strip() for name in items if name.strip()])
)

# Convert the data into a one-hot encoded DataFrame suitable for the Apriori
# algorithm: one row per transaction, one boolean column per distinct product.
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Display the one-hot encoded dataframe
print("One-hot Encoded DataFrame:")
print(df_encoded.head())


  and should_run_async(code)


Initial Dataset Preview:
                                                   0
0  shrimp,almonds,avocado,vegetables mix,green gr...
1                             burgers,meatballs,eggs
2                                            chutney
3                                     turkey,avocado
4  mineral water,milk,energy bar,whole wheat rice...
One-hot Encoded DataFrame:
   almonds  antioxydant juice  asparagus  avocado  babies food  bacon  \
0     True               True      False     True        False  False   
1    False              False      False    False        False  False   
2    False              False      False    False        False  False   
3    False              False      False     True        False  False   
4    False              False      False    False        False  False   

   barbecue sauce  black tea  blueberries  body spray  ...  turkey  \
0           False      False        False       False  ...   False   
1           False      False        False       Fal

In [8]:
from mlxtend.frequent_patterns import apriori, association_rules

# Mine frequent itemsets from the one-hot encoded transactions using a
# minimum support of 0.02; use_colnames=True reports itemsets by column
# name instead of column index.
frequent_itemsets = apriori(
    df_encoded,
    use_colnames=True,
    min_support=0.02,
)

# Derive association rules from the frequent itemsets, filtering on the
# confidence metric with a minimum threshold of 0.3.
rules = association_rules(
    frequent_itemsets,
    min_threshold=0.3,
    metric="confidence",
)

# Show a heading followed by the first few rules
print("Generated Association Rules:", rules.head(), sep="\n")


Generated Association Rules:
     antecedents      consequents  antecedent support  consequent support  \
0      (burgers)           (eggs)            0.087188            0.179709   
1         (cake)  (mineral water)            0.081056            0.238368   
2      (chicken)  (mineral water)            0.059992            0.238368   
3    (chocolate)  (mineral water)            0.163845            0.238368   
4  (cooking oil)  (mineral water)            0.051060            0.238368   

    support  confidence      lift  leverage  conviction  zhangs_metric  
0  0.028796    0.330275  1.837830  0.013128    1.224818       0.499424  
1  0.027463    0.338816  1.421397  0.008142    1.151921       0.322617  
2  0.022797    0.380000  1.594172  0.008497    1.228438       0.396502  
3  0.052660    0.321400  1.348332  0.013604    1.122357       0.308965  
4  0.020131    0.394256  1.653978  0.007960    1.257349       0.416672  


  and should_run_async(code)


In [9]:
# Order the rules by descending lift so the highest-lift rules come first
rules = rules.sort_values('lift', ascending=False)

# Take the ten highest-lift rules and print them under a heading
top_rules = rules.head(10)
print("Top 10 Association Rules Sorted by Lift:", top_rules, sep="\n")

# Example interpretation:
# A high lift means the products in a rule are bought together more often
# than their individual popularity alone would predict.
# For instance, if 'mineral water' and 'salmon' appear together frequently
# and have a high lift, these could be promoted as a bundle.


Top 10 Association Rules Sorted by Lift:
            antecedents      consequents  antecedent support  \
8         (ground beef)      (spaghetti)            0.098254   
18          (olive oil)      (spaghetti)            0.065858   
14               (soup)  (mineral water)            0.050527   
0             (burgers)           (eggs)            0.087188   
11          (olive oil)  (mineral water)            0.065858   
19           (tomatoes)      (spaghetti)            0.068391   
7         (ground beef)  (mineral water)            0.098254   
4         (cooking oil)  (mineral water)            0.051060   
2             (chicken)  (mineral water)            0.059992   
6   (frozen vegetables)  (mineral water)            0.095321   

    consequent support   support  confidence      lift  leverage  conviction  \
8             0.174110  0.039195    0.398915  2.291162  0.022088    1.373997   
18            0.174110  0.022930    0.348178  1.999758  0.011464    1.267048   
14            

  and should_run_async(code)


In [None]:
"""
Interview Questions
What is lift and why is it important in association rules?

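Lift measures the strength of an association between two products (more generally, between the antecedent and the consequent of a rule). It is calculated as the ratio of the observed support to the expected support if the two items were independent: lift(A -> B) = support(A and B) / (support(A) * support(B)), which is the same as confidence(A -> B) / support(B). Lift > 1 indicates a positive correlation, which is important because it shows that the occurrence of one item increases the likelihood of the other; lift = 1 means the items are independent, and lift < 1 means they occur together less often than expected.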
What are support and confidence? How do you calculate them?

Support is the frequency with which an itemset appears in the dataset. It is calculated as the number of transactions containing the itemset divided by the total number of transactions.
Confidence is the likelihood that the consequent of a rule is present in transactions containing the antecedent. It is calculated as the ratio of the number of transactions containing both the antecedent and the consequent to the number of transactions containing the antecedent.
What are some limitations or challenges of association rule mining?

Some challenges include the computational complexity of generating a large number of rules, difficulty in setting appropriate thresholds for support and confidence, and the potential for discovering rules that are not practically useful or are too obvious.
"""