In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from mlxtend.frequent_patterns import association_rules,apriori
from mlxtend.preprocessing import TransactionEncoder
import warnings
warnings.filterwarnings('ignore')

In [101]:
data= pd.read_excel('Online retail.xlsx',header=None,names=['items'])

In [102]:
data.head()

Unnamed: 0,items
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."


In [103]:
data.shape

(7501, 1)

In [104]:
data

Unnamed: 0,items
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."
...,...
7496,"butter,light mayo,fresh bread"
7497,"burgers,frozen vegetables,eggs,french fries,ma..."
7498,chicken
7499,"escalope,green tea"


In [105]:
# Step 1: Splitting items in each transaction
transactions = data["items"].str.split(',',expand=True)

In [106]:
df=data.join(transactions)
df

Unnamed: 0,items,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,"shrimp,almonds,avocado,vegetables mix,green gr...",shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,...,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,"burgers,meatballs,eggs",burgers,meatballs,eggs,,,,,,,...,,,,,,,,,,
2,chutney,chutney,,,,,,,,,...,,,,,,,,,,
3,"turkey,avocado",turkey,avocado,,,,,,,,...,,,,,,,,,,
4,"mineral water,milk,energy bar,whole wheat rice...",mineral water,milk,energy bar,whole wheat rice,green tea,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496,"butter,light mayo,fresh bread",butter,light mayo,fresh bread,,,,,,,...,,,,,,,,,,
7497,"burgers,frozen vegetables,eggs,french fries,ma...",burgers,frozen vegetables,eggs,french fries,magazines,green tea,,,,...,,,,,,,,,,
7498,chicken,chicken,,,,,,,,,...,,,,,,,,,,
7499,"escalope,green tea",escalope,green tea,,,,,,,,...,,,,,,,,,,


In [107]:
# drop items column.

df.drop(columns=['items'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496,butter,light mayo,fresh bread,,,,,,,,,,,,,,,,,
7497,burgers,frozen vegetables,eggs,french fries,magazines,green tea,,,,,,,,,,,,,,
7498,chicken,,,,,,,,,,,,,,,,,,,
7499,escalope,green tea,,,,,,,,,,,,,,,,,,


In [113]:
transactions = []
for i in range(len(df)):
    transactions.append([str(df.values[i, j]) for j in range(len(df.columns))])

In [116]:
transactions

[['shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil',
  'shrimp',
  'almonds',
  'avocado',
  'vegetables mix',
  'green grapes',
  'whole weat flour',
  'yams',
  'cottage cheese',
  'energy drink',
  'tomato juice',
  'low fat yogurt',
  'green tea',
  'honey',
  'salad',
  'mineral water',
  'salmon',
  'antioxydant juice',
  'frozen smoothie',
  'spinach',
  'olive oil'],
 ['burgers,meatballs,eggs',
  'burgers',
  'meatballs',
  'eggs',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None'],
 ['chutney',
  'chutney',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  'None',
  '

In [117]:
# Step 2: One-hot encode the transactions
te = TransactionEncoder()
te

In [118]:
te_data=te.fit(transactions).transform(transactions)

In [119]:
te_data

array([[False, False,  True, ..., False, False, False],
       [False,  True, False, ..., False, False, False],
       [False,  True, False, ..., False, False, False],
       ...,
       [False,  True, False, ..., False, False, False],
       [False,  True, False, ..., False, False, False],
       [False,  True, False, ..., False, False, False]])

In [120]:
# convert it into datframe
df = pd.DataFrame(te_data, columns=te.columns_)
df.drop(columns='None',inplace=True)

In [121]:
df.head()

Unnamed: 0,asparagus,almonds,"almonds,cake,low fat yogurt","almonds,cookies","almonds,eggs","almonds,eggs,cookies","almonds,eggs,cooking oil,french fries,green tea","almonds,eggs,yogurt cake","almonds,french wine","almonds,french wine,green tea,french fries,escalope,strawberries,tomato juice,honey",...,"yams,mineral water,french fries","yams,mineral water,soup,milk,pancakes,whole wheat rice,barbecue sauce,carrots,chocolate,champagne","yams,mint",yogurt cake,"yogurt cake,candy bars","yogurt cake,energy drink","yogurt cake,honey","yogurt cake,low fat yogurt","yogurt cake,mint",zucchini
0,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [122]:
# Step 3: Apply the Apriori algorithm
# Set minimum support threshold, e.g., 0.01 (1%)
frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.020397,(almonds)
1,0.033329,(avocado)
2,0.010799,(barbecue sauce)
3,0.014265,(black tea)
4,0.011465,(body spray)
...,...,...
252,0.011065,"(milk, ground beef, mineral water)"
253,0.017064,"(ground beef, spaghetti, mineral water)"
254,0.015731,"(milk, spaghetti, mineral water)"
255,0.010265,"(spaghetti, olive oil, mineral water)"


In [123]:
# Generate the association rules
# Set minimum confidence, e.g., 0.2 (20%)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)

In [124]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(avocado),(mineral water),0.033329,0.238368,0.011598,0.348000,1.459926,0.003654,1.168147,0.325896
1,(burgers),(eggs),0.087188,0.179709,0.028796,0.330275,1.837830,0.013128,1.224818,0.499424
2,(burgers),(french fries),0.087188,0.170911,0.021997,0.252294,1.476173,0.007096,1.108844,0.353384
3,(burgers),(green tea),0.087188,0.132116,0.017464,0.200306,1.516139,0.005945,1.085270,0.372947
4,(burgers),(milk),0.087188,0.129583,0.017864,0.204893,1.581175,0.006566,1.094717,0.402667
...,...,...,...,...,...,...,...,...,...,...
157,"(spaghetti, mineral water)",(milk),0.059725,0.129583,0.015731,0.263393,2.032623,0.007992,1.181657,0.540294
158,"(spaghetti, olive oil)",(mineral water),0.022930,0.238368,0.010265,0.447674,1.878079,0.004799,1.378954,0.478514
159,"(olive oil, mineral water)",(spaghetti),0.027596,0.174110,0.010265,0.371981,2.136468,0.005460,1.315071,0.547034
160,"(spaghetti, pancakes)",(mineral water),0.025197,0.238368,0.011465,0.455026,1.908923,0.005459,1.397557,0.488452


In [125]:
# Display results

print("Frequent Itemsets:\n", frequent_itemsets)


Frequent Itemsets:
       support                                 itemsets
0    0.020397                                (almonds)
1    0.033329                                (avocado)
2    0.010799                         (barbecue sauce)
3    0.014265                              (black tea)
4    0.011465                             (body spray)
..        ...                                      ...
252  0.011065       (milk, ground beef, mineral water)
253  0.017064  (ground beef, spaghetti, mineral water)
254  0.015731         (milk, spaghetti, mineral water)
255  0.010265    (spaghetti, olive oil, mineral water)
256  0.011465     (spaghetti, mineral water, pancakes)

[257 rows x 2 columns]


In [126]:
print("Association Rules:\n", rules)

Association Rules:
                     antecedents      consequents  antecedent support  \
0                     (avocado)  (mineral water)            0.033329   
1                     (burgers)           (eggs)            0.087188   
2                     (burgers)   (french fries)            0.087188   
3                     (burgers)      (green tea)            0.087188   
4                     (burgers)           (milk)            0.087188   
..                          ...              ...                 ...   
157  (spaghetti, mineral water)           (milk)            0.059725   
158      (spaghetti, olive oil)  (mineral water)            0.022930   
159  (olive oil, mineral water)      (spaghetti)            0.027596   
160       (spaghetti, pancakes)  (mineral water)            0.025197   
161   (mineral water, pancakes)      (spaghetti)            0.033729   

     consequent support   support  confidence      lift  leverage  conviction  \
0              0.238368  0.011598 

# ____________________________________________________________

# Interview Questions

# Question 1: What is lift and why is it important in Association rules?

# Answer:

## Lift is a metric used to evaluate the strength of an association between two items in the context of association rule mining. It helps determine if the occurrence of one item increases the likelihood of the occurrence of another item, beyond what would be expected by chance.
## Lift measures how much more likely two items are to occur together compared to when they occur independently.
## Importance:
### 1.Identifies strong relationships: Lift helps identify rules where the co-occurrence of items is significantly higher than expected by chance.
### 2.Helps in pruning: A lift greater than 1 indicates useful rules, whereas a lift close to 1 can be considered less interesting.
### 3.Improves model relevance: High lift can guide marketing strategies, product bundling, or personalized recommendations by highlighting strong associations.

# ___________________________________________________________

# Question 2.What is support and Confidence. How do you calculate them?

# Answer:

## Support and Confidence are fundamental metrics in association rule mining, helping measure the significance and reliability of a rule.
## Support:
### Support measures the proportion of transactions in the dataset that contain a particular item or itemset.
### In general, the support of an itemset can be calculated using the following formula.

### Support(X) = (Number of transactions containing X) / (Total number of transactions)
​



## Confidence:

### In data mining, confidence is a measure of the reliability or support for a given association rule.
### Confidence measures the likelihood that an item B is purchases given that item A is purchased. It reflects the conditional probablitiy of B given A.

### Confidence(X => Y) = (Number of transactions containing X and Y) / (Number of transactions containing X)

# _________________________________________________________

# Question 3: What are some limitations or challenges of Association rules mining?


## Answer:

## 1. Computational Complexity
### Challenge: Association rule mining, especially with large datasets, can be computationally expensive.

### Reason: The process involves scanning the dataset multiple times and evaluating all possible itemsets, which can result in exponential time complexity.

### Solution: Use efficient algorithms like Apriori or FP-growth to reduce computation by pruning non-promising itemsets early.




## 2. Lack of Context
### Challenge: Association rules often do not provide context about causality.
### Reason: A rule such as "if A, then B" does not imply that A causes B, just that they appear together frequently.
### Solution: Combine association rule mining with domain knowledge to add interpretability and actionable insights.

## 3. Overfitting
### Challenge: Mining too many rules can lead to overfitting, where the model captures spurious relationships that do not generalize well.

### Reason: If you don't set appropriate thresholds (e.g., support, confidence), the algorithm may generate a large number of rules, many of which may not be useful.

### Solution: Set minimum thresholds for support, confidence, and lift to filter out weak or irrelevant rules.

## 4. Handling Large Itemsets
### Challenge: As the number of items grows, the number of possible itemsets grows exponentially, leading to a combinatorial explosion.

### Reason: Large itemsets (i.e., itemsets with many items) can be rare and computationally expensive to evaluate.

### Solution: Use constraint-based mining or Frequent Pattern Growth (FP-growth) to efficiently handle large itemsets.



## 5. Sparsity of Data
### Challenge: In some datasets (e.g., retail or e-commerce), most items are rarely purchased together, resulting in a sparse matrix.

### Reason: Most item pairs will have low support, and finding interesting rules becomes difficult.

### Solution: Apply dimensionality reduction techniques or focus on frequent itemsets that appear with significant support.

## 6. Interpretability
### Challenge: As the number of rules grows, it becomes harder to interpret or act upon them effectively.

### Reason: A large number of rules can lead to information overload.

### Solution: Use rule pruning techniques or focus on the most reliable and useful rules based on metrics like lift, support, and confidence.

## 7. Lack of Temporal or Sequential Information
### Challenge: Association rules do not take into account the temporal or sequential relationships between items.

### Reason: Items might be related based on time, but traditional association rule mining treats itemsets as independent.

### Solution: Use sequential pattern mining or temporal association rules to consider item purchase sequences over time.

## 8. Imbalanced Data
### Challenge: Association rule mining assumes that the dataset is relatively balanced, but real-world datasets may have imbalanced occurrences of different items.

### Reason: Imbalanced datasets may lead to biased rules, where the rare item (e.g., expensive products) is not sufficiently represented.

### Solution: Use techniques like undersampling, oversampling, or balanced support to handle imbalanced data.

# ------ THE END ------