In [None]:
# Assignment E2
# Perform Market Basket Analysis using Apriori Algorithm 

# Details of the dataset
# The dataset has 38765 rows of the purchase orders of people from the grocery stores. These orders can be analyzed and association rules can be generated using Market Basket Analysis.

# Required libraries
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

In [19]:
#Load dataset
file_path = 'Groceries_dataset.csv'
data = pd.read_csv(file_path)

In [None]:
# Remove duplicates
data = data.drop_duplicates()

In [23]:
#Check missing values
print(data.isnull().sum())

Member_number      0
Date               0
itemDescription    0
dtype: int64


In [25]:
# Create pivot table for basket format: rows = members, columns = items
basket = data.pivot_table(index='Member_number', columns='itemDescription', aggfunc=len, fill_value=0)

# Convert quantities to binary values
basket = basket.applymap(lambda x: 1 if x > 0 else 0)

# Optional: view a sample
print(basket.head())


                                 Date                            \
itemDescription Instant food products UHT-milk abrasive cleaner   
Member_number                                                     
1000                                0        0                0   
1001                                0        0                0   
1002                                0        0                0   
1003                                0        0                0   
1004                                0        0                0   

                                                                    \
itemDescription artif. sweetener baby cosmetics bags baking powder   
Member_number                                                        
1000                           0              0    0             0   
1001                           0              0    0             0   
1002                           0              0    0             0   
1003                           0           

In [26]:
# Set minimum support threshold
min_support = 0.01

# Find frequent itemsets
frequent_itemsets = apriori(basket, min_support=min_support, use_colnames=True)

# Optional: view top itemsets
print(frequent_itemsets.head())




    support                         itemsets
0  0.015393  ((Date, Instant food products))
1  0.078502               ((Date, UHT-milk))
2  0.031042          ((Date, baking powder))
3  0.119548                   ((Date, beef))
4  0.079785                ((Date, berries))


In [27]:
# Set minimum confidence threshold
min_confidence = 0.2

# Generate rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)

# Sort rules by lift
rules = rules.sort_values(by="lift", ascending=False)

# Display top 10 rules
print(rules.head(10))


                                            antecedents  \
5820  ((Date, other vegetables), (Date, sausage), (D...   
5814  ((Date, other vegetables), (Date, yogurt), (Da...   
4902                     ((Date, curd), (Date, yogurt))   
5818  ((Date, sausage), (Date, rolls/buns), (Date, w...   
5813  ((Date, yogurt), (Date, rolls/buns), (Date, wh...   
2644                 ((Date, meat), (Date, whole milk))   
5819  ((Date, other vegetables), (Date, sausage), (D...   
5774       ((Date, yogurt), (Date, whipped/sour cream))   
5193               ((Date, sausage), (Date, pip fruit))   
5831  ((Date, other vegetables), (Date, whole milk),...   

                                      consequents  antecedent support  \
5820         ((Date, yogurt), (Date, rolls/buns))            0.050282   
5814        ((Date, sausage), (Date, whole milk))            0.052335   
4902        ((Date, sausage), (Date, whole milk))            0.040277   
5818   ((Date, other vegetables), (Date, yogurt))         

In [28]:
# Save results to CSV files
frequent_itemsets.to_csv('frequent_itemsets.csv', index=False)
rules.to_csv('association_rules.csv', index=False)
