<a href="https://colab.research.google.com/github/PavanKorukonda/Association-Rule-mining/blob/main/Assosiationrules.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Step-1: Setting up environment**

In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

print("All packages imported successfully!")

All packages imported successfully!


# **Step-2:Load and Inspect data**

In [2]:
df=pd.read_excel('/content/Online Retail.xlsx')

In [3]:
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


# **Step-3: Data Cleaning**

In [4]:
df.isnull().values

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [6]:
# Remove extra spaces from descriptions
df['Description'] = df['Description'].str.strip()

In [7]:
# Remove rows with missing InvoiceNo
df = df[df['InvoiceNo'].notnull()]

In [8]:
# Remove cancelled transactions (Invoice numbers starting with 'C')
df = df[~df['InvoiceNo'].astype(str).str.startswith('C')]

In [9]:
# Remove non-product items such as 'POSTAGE', 'Manual'
remove_list = ['POSTAGE', 'Manual']
df = df[~df['Description'].isin(remove_list)]

In [10]:
df = df[df['Country'] == "France"] # only France taken

In [11]:
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
26,536370,22728,ALARM CLOCK BAKELIKE PINK,24,2010-12-01 08:45:00,3.75,12583.0,France
27,536370,22727,ALARM CLOCK BAKELIKE RED,24,2010-12-01 08:45:00,3.75,12583.0,France
28,536370,22726,ALARM CLOCK BAKELIKE GREEN,12,2010-12-01 08:45:00,3.75,12583.0,France
29,536370,21724,PANDA AND BUNNIES STICKER SHEET,12,2010-12-01 08:45:00,0.85,12583.0,France
30,536370,21883,STARS GIFT TAPE,24,2010-12-01 08:45:00,0.65,12583.0,France
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


# Step 4: Transform Data into Transactional Format

In [12]:
df = df[['InvoiceNo','Description']]
df.shape
from sklearn.preprocessing import OneHotEncoder
df_encoded = pd.get_dummies(df,columns =['Description'])
df_encoded.columns = df_encoded.columns.str.replace('Description_', '')
display(df_encoded.head())

Unnamed: 0,InvoiceNo,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 EGG HOUSE PAINTED WOOD,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,12 PENCILS TALL TUBE RED RETROSPOT,...,WRAP VINTAGE PETALS DESIGN,YELLOW COAT RACK PARIS FASHION,YELLOW GIANT GARDEN THERMOMETER,YELLOW SHARK HELICOPTER,ZINC STAR T-LIGHT HOLDER,ZINC FOLKART SLEIGH BELLS,ZINC HERB GARDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL
26,536370,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
27,536370,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
28,536370,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
29,536370,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
30,536370,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [13]:
# This cell is no longer needed as the basket is created in cell Elm_nDYvXwKH
basket = df_encoded.groupby('InvoiceNo').sum()
def encode_units(x):
    if x <= 0:
        return 0
    else:
        return 1

basket = basket.applymap(encode_units)
basket

Unnamed: 0_level_0,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 EGG HOUSE PAINTED WOOD,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE WOODLAND,...,WRAP VINTAGE PETALS DESIGN,YELLOW COAT RACK PARIS FASHION,YELLOW GIANT GARDEN THERMOMETER,YELLOW SHARK HELICOPTER,ZINC STAR T-LIGHT HOLDER,ZINC FOLKART SLEIGH BELLS,ZINC HERB GARDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536370,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536852,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536974,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
537065,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
537463,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580986,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
581001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
581171,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
581279,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Step 5: Run Apriori Algorithm

In [14]:
from mlxtend.frequent_patterns import apriori

# Find frequent itemsets with min_support=0.01 (1%)
frequent_itemsets_ap = apriori(basket, min_support=0.01, use_colnames=True)
print(frequent_itemsets_ap.sort_values('support', ascending=False))

        support                                           itemsets
330    0.193211                               (RABBIT NIGHT LIGHT)
369    0.185379                    (RED TOADSTOOL LED NIGHT LIGHT)
319    0.174935                 (PLASTERS IN TIN WOODLAND ANIMALS)
314    0.172324                    (PLASTERS IN TIN CIRCUS PARADE)
399    0.161880               (ROUND SNACK BOXES SET OF4 WOODLAND)
...         ...                                                ...
29843  0.010444  (LUNCH BOX WITH CUTLERY RETROSPOT, CHILDRENS C...
29842  0.010444  (LUNCH BOX WITH CUTLERY RETROSPOT, CHILDRENS C...
29841  0.010444  (LUNCH BOX WITH CUTLERY RETROSPOT, CHILDRENS C...
29840  0.010444  (LUNCH BOX WITH CUTLERY RETROSPOT, CHILDRENS C...
10     0.010444                           (36 PENCILS TUBE SKULLS)

[29860 rows x 2 columns]


# Step 6: FP growth **Algorithm**

In [15]:
from mlxtend.frequent_patterns import fpgrowth

# Find frequent itemsets with FP-Growth (should return same as Apriori)
frequent_itemsets_fp = fpgrowth(basket, min_support=0.01, use_colnames=True)
print(frequent_itemsets_fp.sort_values('support', ascending=False))

        support                                           itemsets
410    0.193211                               (RABBIT NIGHT LIGHT)
0      0.185379                    (RED TOADSTOOL LED NIGHT LIGHT)
34     0.174935                 (PLASTERS IN TIN WOODLAND ANIMALS)
145    0.172324                    (PLASTERS IN TIN CIRCUS PARADE)
1      0.161880               (ROUND SNACK BOXES SET OF4 WOODLAND)
...         ...                                                ...
12452  0.010444  (ROUND SNACK BOXES SET OF4 WOODLAND, LUNCH BAG...
12453  0.010444  (LUNCH BOX WITH CUTLERY RETROSPOT, CHILDRENS C...
12454  0.010444  (CHILDRENS CUTLERY DOLLY GIRL, ROUND SNACK BOX...
12455  0.010444  (CHILDRENS CUTLERY DOLLY GIRL, ROUND SNACK BOX...
12444  0.010444  (LUNCH BOX WITH CUTLERY RETROSPOT, CHILDRENS C...

[29860 rows x 2 columns]


# Step 7: Generate and Interpret Assosiation Rules

In [16]:
from mlxtend.frequent_patterns import association_rules

# Association rules from Apriori results, filtering by lift > 1
rules = association_rules(frequent_itemsets_ap, metric="lift", min_threshold=1)
print(rules[['antecedents','consequents', 'support', 'confidence', 'lift']])

                            antecedents  \
0              (10 COLOUR SPACEBOY PEN)   
1                     (CARD DOLLY GIRL)   
2              (10 COLOUR SPACEBOY PEN)   
3           (CHARLOTTE BAG SUKI DESIGN)   
4              (10 COLOUR SPACEBOY PEN)   
...                                 ...   
1139923  (SKULL LUNCH BOX WITH CUTLERY)   
1139924      (ALARM CLOCK BAKELIKE RED)   
1139925        (LUNCH BAG APPLE DESIGN)   
1139926      (RED RETROSPOT MINI CASES)   
1139927       (LUNCH BOX I LOVE LONDON)   

                                               consequents   support  \
0                                        (CARD DOLLY GIRL)  0.010444   
1                                 (10 COLOUR SPACEBOY PEN)  0.010444   
2                              (CHARLOTTE BAG SUKI DESIGN)  0.010444   
3                                 (10 COLOUR SPACEBOY PEN)  0.010444   
4                             (ICE CREAM SUNDAE LIP GLOSS)  0.010444   
...                                                

In [17]:
# We'll use the frequent itemsets from FP-Growth.
# We are looking for rules with a 'lift' greater than 1.
print("\nGenerating association rules (metric='lift', min_threshold=1)...")
# We will use a common threshold for confidence as well, e.g., 0.5
rules = association_rules(frequent_itemsets_fp, metric="lift", min_threshold=1)
# Sort the rules by 'lift' to see the strongest associations first
rules_sorted = rules.sort_values('lift', ascending=False)
print("\n--- Top Association Rules (Sorted by Lift) ---")
# Displaying the columns you requested
print(rules_sorted[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10).to_string(index=False))

print("\n--- Interpretation Example (first rule) ---")
if not rules_sorted.empty:
    first_rule = rules_sorted.iloc[0]
    print(f"Rule: IF a customer buys {list(first_rule['antecedents'])},")
    print(f"THEN they are {first_rule['lift']:.2f} times MORE LIKELY")
    print(f"to also buy {list(first_rule['consequents'])}.")
    print(f"This rule has a confidence of {first_rule['confidence']:.0%}.")
else:
    print("No rules found with the current thresholds.")


Generating association rules (metric='lift', min_threshold=1)...

--- Top Association Rules (Sorted by Lift) ---
                                                                                                   antecedents                                                                                                                                    consequents  support  confidence  lift
                                  (JUMBO BAG APPLES, CHILDRENS CUTLERY DOLLY GIRL, ALARM CLOCK BAKELIKE GREEN)   (LUNCH BOX WITH CUTLERY RETROSPOT, SKULL LUNCH BOX WITH CUTLERY, ALARM CLOCK BAKELIKE RED, LUNCH BAG APPLE DESIGN, RED RETROSPOT MINI CASES) 0.010444         1.0 95.75
      (RED RETROSPOT MINI CASES, ALARM CLOCK BAKELIKE RED, LUNCH BAG APPLE DESIGN, ALARM CLOCK BAKELIKE GREEN)                               (JUMBO BAG APPLES, LUNCH BOX WITH CUTLERY RETROSPOT, CHILDRENS CUTLERY DOLLY GIRL, SKULL LUNCH BOX WITH CUTLERY) 0.010444         1.0 95.75
  (RED RETROSPOT MINI CASES, LUNCH BAG APPL

In [18]:
# Adjusting Parameters
# Let's try with different parameters
# We'll increase support to 0.05 (find more itemsets)
# and increase the lift threshold to 6 (find only very strong rules).

print("\n\n--- Running with Adjusted Parameters (min_support=0.05, min_lift=6) ---")

# 1. Re-run FP-Growth with new support
frequent_itemsets_adj = fpgrowth(basket, min_support=0.05, use_colnames=True)
print(f"Found {len(frequent_itemsets_adj)} frequent itemsets with min_support=0.05")

# 2. Generate rules with new lift threshold
rules_adj = association_rules(frequent_itemsets_adj, metric="lift", min_threshold=6)
print(f"Found {len(rules_adj)} rules with min_lift > 6")

# 3. Display new rules
print("\n--- Top Adjusted Rules (Sorted by Lift) ---")
print(rules_adj.sort_values('lift', ascending=False)[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10).to_string(index=False))

print("\n\nProcess complete.")



--- Running with Adjusted Parameters (min_support=0.05, min_lift=6) ---
Found 113 frequent itemsets with min_support=0.05
Found 22 rules with min_lift > 6

--- Top Adjusted Rules (Sorted by Lift) ---
                                            antecedents                                             consequents  support  confidence      lift
                           (PACK OF 6 SKULL PAPER CUPS)                          (PACK OF 6 SKULL PAPER PLATES) 0.052219    0.800000 13.927273
                         (PACK OF 6 SKULL PAPER PLATES)                            (PACK OF 6 SKULL PAPER CUPS) 0.052219    0.909091 13.927273
                           (CHILDRENS CUTLERY SPACEBOY)                          (CHILDRENS CUTLERY DOLLY GIRL) 0.065274    0.925926 12.665344
                         (CHILDRENS CUTLERY DOLLY GIRL)                            (CHILDRENS CUTLERY SPACEBOY) 0.065274    0.892857 12.665344
(ALARM CLOCK BAKELIKE GREEN, ALARM CLOCK BAKELIKE PINK)                            