In [1]:
import numpy as np 
import pandas as pd 
from mlxtend.frequent_patterns import apriori, association_rules 
from matplotlib import pyplot as plt

### 1. Data Collection

In [2]:
# Loading the Data 
data = pd.read_excel('online_retail.xlsx') 
data.head()
print(f"Shape of data after removing missing values: {data.shape}")

Shape of data after removing missing values: (525461, 8)


In [3]:
# Amount of Data
data.__len__()

525461

In [4]:
# Explore the Backgorund of the Data
data.columns
print("unqiue country: ", data.Country.unique())
print("Counties: ", data.Country.unique().__len__())

unqiue country:  ['United Kingdom' 'France' 'USA' 'Belgium' 'Australia' 'EIRE' 'Germany'
 'Portugal' 'Japan' 'Denmark' 'Nigeria' 'Netherlands' 'Poland' 'Spain'
 'Channel Islands' 'Italy' 'Cyprus' 'Greece' 'Norway' 'Austria' 'Sweden'
 'United Arab Emirates' 'Finland' 'Switzerland' 'Unspecified' 'Malta'
 'Bahrain' 'RSA' 'Bermuda' 'Hong Kong' 'Singapore' 'Thailand' 'Israel'
 'Lithuania' 'West Indies' 'Lebanon' 'Korea' 'Brazil' 'Canada' 'Iceland']
Counties:  40


### 2. Data Processing

In [5]:
# Handle missing values appropriately
# Remove missing values
required_columns = ['Invoice', 'StockCode', 'Quantity', 'Description', 'Customer ID']
data_cleaned = data.dropna(subset=required_columns)
print("Data after removing missing values:")
print(data_cleaned.head())  # Display the first few rows
print(f"Shape of data after removing missing values: {data_cleaned.shape}")
print(f"Size of data after removing missing values: {len(data_cleaned)}")




Data after removing missing values:
  Invoice StockCode                          Description  Quantity  \
0  489434     85048  15CM CHRISTMAS GLASS BALL 20 LIGHTS        12   
1  489434    79323P                   PINK CHERRY LIGHTS        12   
2  489434    79323W                  WHITE CHERRY LIGHTS        12   
3  489434     22041         RECORD FRAME 7" SINGLE SIZE         48   
4  489434     21232       STRAWBERRY CERAMIC TRINKET BOX        24   

          InvoiceDate  Price  Customer ID         Country  
0 2009-12-01 07:45:00   6.95      13085.0  United Kingdom  
1 2009-12-01 07:45:00   6.75      13085.0  United Kingdom  
2 2009-12-01 07:45:00   6.75      13085.0  United Kingdom  
3 2009-12-01 07:45:00   2.10      13085.0  United Kingdom  
4 2009-12-01 07:45:00   1.25      13085.0  United Kingdom  
Shape of data after removing missing values: (417534, 8)
Size of data after removing missing values: 417534


In [6]:
# Filter out negative quantities from the DataFrame
data_cleaned = data_cleaned[data_cleaned['Quantity'] > 0]
print("Data after filtering out negative quantities:")
print(data_cleaned.head())  # Display the first few rows
print(f"Shape of data after filtering negative quantities: {data_cleaned.shape}")
print(f"Size of data after filtering negative quantities: {len(data_cleaned)}")


Data after filtering out negative quantities:
  Invoice StockCode                          Description  Quantity  \
0  489434     85048  15CM CHRISTMAS GLASS BALL 20 LIGHTS        12   
1  489434    79323P                   PINK CHERRY LIGHTS        12   
2  489434    79323W                  WHITE CHERRY LIGHTS        12   
3  489434     22041         RECORD FRAME 7" SINGLE SIZE         48   
4  489434     21232       STRAWBERRY CERAMIC TRINKET BOX        24   

          InvoiceDate  Price  Customer ID         Country  
0 2009-12-01 07:45:00   6.95      13085.0  United Kingdom  
1 2009-12-01 07:45:00   6.75      13085.0  United Kingdom  
2 2009-12-01 07:45:00   6.75      13085.0  United Kingdom  
3 2009-12-01 07:45:00   2.10      13085.0  United Kingdom  
4 2009-12-01 07:45:00   1.25      13085.0  United Kingdom  
Shape of data after filtering negative quantities: (407695, 8)
Size of data after filtering negative quantities: 407695


In [7]:
# Remove canceled invoices from the DataFrame
data_cleaned = data_cleaned[~data_cleaned['Invoice'].astype(str).str.contains('C', na=False)]
print("Data after removing canceled invoices:")
print(data_cleaned.head())  # Display the first few rows
print(f"Shape of data after removing canceled invoices: {data_cleaned.shape}")
print(f"Size of data after removing canceled invoices: {len(data_cleaned)}")

Data after removing canceled invoices:
  Invoice StockCode                          Description  Quantity  \
0  489434     85048  15CM CHRISTMAS GLASS BALL 20 LIGHTS        12   
1  489434    79323P                   PINK CHERRY LIGHTS        12   
2  489434    79323W                  WHITE CHERRY LIGHTS        12   
3  489434     22041         RECORD FRAME 7" SINGLE SIZE         48   
4  489434     21232       STRAWBERRY CERAMIC TRINKET BOX        24   

          InvoiceDate  Price  Customer ID         Country  
0 2009-12-01 07:45:00   6.95      13085.0  United Kingdom  
1 2009-12-01 07:45:00   6.75      13085.0  United Kingdom  
2 2009-12-01 07:45:00   6.75      13085.0  United Kingdom  
3 2009-12-01 07:45:00   2.10      13085.0  United Kingdom  
4 2009-12-01 07:45:00   1.25      13085.0  United Kingdom  
Shape of data after removing canceled invoices: (407695, 8)
Size of data after removing canceled invoices: 407695


### 3. Create a basket

In [8]:
# Create a basket by grouping by 'Invoice' and concatenating descriptions
basket = data_cleaned.groupby('Invoice')['Description'].apply(lambda x: ', '.join(x)).reset_index()
print("Basket created:")
print(basket.head())  # Display the first few rows of the basket
print(f"Shape of basket: {basket.shape}")

Basket created:
   Invoice                                        Description
0   489434  15CM CHRISTMAS GLASS BALL 20 LIGHTS, PINK CHER...
1   489435  CAT BOWL , DOG BOWL , CHASING BALL DESIGN, HEA...
2   489436  DOOR MAT BLACK FLOCK , LOVE BUILDING BLOCK WOR...
3   489437  CHRISTMAS CRAFT HEART DECORATIONS, CHRISTMAS C...
4   489438  DINOSAURS  WRITING SET , SET OF MEADOW  FLOWER...
Shape of basket: (19215, 2)


### 4. One-hot Encoding

In [9]:
from mlxtend.preprocessing import TransactionEncoder

# Create a list of transactions (split descriptions into lists)
transactions = basket['Description'].str.split(', ').tolist()

# Perform one-hot encoding
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_onehot = pd.DataFrame(te_ary, columns=te.columns_)

# Display the one-hot encoded DataFrame
print("One-Hot Encoded DataFrame:")
print(df_onehot.head())

One-Hot Encoded DataFrame:
            DOORMAT UNION JACK GUNS AND ROSES   3 STRIPEY MICE FELTCRAFT  \
0  False                                False                      False   
1  False                                False                      False   
2  False                                False                      False   
3  False                                False                      False   
4  False                                False                      False   

    4 PURPLE FLOCK DINNER CANDLES   ANIMAL STICKERS  \
0                           False             False   
1                           False             False   
2                           False             False   
3                           False             False   
4                           False             False   

    BLACK PIRATE TREASURE CHEST   BROWN  PIRATE TREASURE CHEST   \
0                         False                           False   
1                         False                    

### 5. Implementation of Apriori Algorithm

In [10]:
from mlxtend.frequent_patterns import apriori, association_rules

# Apply Apriori Algorithm
frequent_itemsets = apriori(df_onehot, min_support=0.01, use_colnames=True)
print("Frequent Itemsets:")
print(frequent_itemsets.head())  # Display the first few frequent itemsets

Frequent Itemsets:
    support                         itemsets
0  0.030965                               ()
1  0.011189           ( WHITE CHERRY LIGHTS)
2  0.011501         (1 HANGER ,MAGIC GARDEN)
3  0.012855         (10 COLOUR SPACEBOY PEN)
4  0.013063  (12 PENCIL SMALL TUBE WOODLAND)


### 6. Assosiation Rules Generation

In [11]:
# Generate Association Rules
# Generate association rules from the frequent itemsets
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1, num_itemsets=len(frequent_itemsets))
print("Association Rules:")
print(rules.head())  # Display the first few association rules


Association Rules:
       antecedents                 consequents  antecedent support  \
0               ()             (BIRTHDAY CARD)            0.030965   
1  (BIRTHDAY CARD)                          ()            0.016185   
2               ()                  (ELEPHANT)            0.030965   
3       (ELEPHANT)                          ()            0.014260   
4               ()  (FANCY FONT BIRTHDAY CARD)            0.030965   

   consequent support   support  confidence       lift  representativity  \
0            0.016185  0.014676    0.473950  29.282769               1.0   
1            0.030965  0.014676    0.906752  29.282769               1.0   
2            0.014260  0.014260    0.460504  32.294118               1.0   
3            0.030965  0.014260    1.000000  32.294118               1.0   
4            0.016081  0.016081    0.519328  32.294118               1.0   

   leverage  conviction  zhangs_metric   jaccard  certainty  kulczynski  
0  0.014175    1.870191      

### 7. Function to Recommend Products

In [12]:
# just Experiment
# Function to Recommend Products
def recommend_products(product_name):
    # Check for the product name in the antecedents using a case-insensitive search
    product_rules = rules[rules['antecedents'].apply(lambda x: any(product_name.lower() == str(item).lower() for item in x))]
    
    if product_rules.empty:
        return f"No recommendations found for '{product_name}'."
    
    recommended = product_rules['consequents'].apply(lambda x: list(x)[0])
    return recommended.tolist()

# Example usage
product_to_check = 'STRAWBERRY CERAMIC TRINKET BOX'  # Ensure this matches a product in your dataset
recommended_products = recommend_products(product_to_check)

# Print the recommendations
print(f"Products recommended for '{product_to_check}': {recommended_products}")

# Function to print the rules associated with a given product
def print_recommendation_proof(product_name):
    # Find rules where the product is an antecedent
    relevant_rules = rules[rules['antecedents'].apply(lambda x: product_name in x)]
    
    if relevant_rules.empty:
        print(f"No rules found for '{product_name}'.")
        return
    
    print(f"Rules for '{product_name}':")
    for index, row in relevant_rules.iterrows():
        antecedents = ', '.join(list(row['antecedents']))
        consequents = ', '.join(list(row['consequents']))
        support = row['support']
        confidence = row['confidence']
        lift = row['lift']
        
        print(f" - If you buy: {antecedents}, then you may also buy: {consequents}. (Support: {support:.2f}, Confidence: {confidence:.2f}, Lift: {lift:.2f})")

# Print the proof for the recommendations
print_recommendation_proof(product_to_check)

Products recommended for 'STRAWBERRY CERAMIC TRINKET BOX': ['60 TEATIME FAIRY CAKE CASES', 'PINK DOUGHNUT TRINKET POT ', 'SWEETHEART CERAMIC TRINKET BOX', 'WHITE HANGING HEART T-LIGHT HOLDER']
Rules for 'STRAWBERRY CERAMIC TRINKET BOX':
 - If you buy: STRAWBERRY CERAMIC TRINKET BOX, then you may also buy: 60 TEATIME FAIRY CAKE CASES. (Support: 0.01, Confidence: 0.17, Lift: 2.83)
 - If you buy: STRAWBERRY CERAMIC TRINKET BOX, then you may also buy: PINK DOUGHNUT TRINKET POT . (Support: 0.01, Confidence: 0.15, Lift: 8.88)
 - If you buy: STRAWBERRY CERAMIC TRINKET BOX, then you may also buy: SWEETHEART CERAMIC TRINKET BOX. (Support: 0.03, Confidence: 0.47, Lift: 11.08)
 - If you buy: STRAWBERRY CERAMIC TRINKET BOX, then you may also buy: WHITE HANGING HEART T-LIGHT HOLDER. (Support: 0.02, Confidence: 0.26, Lift: 1.62)


In [13]:
# Test with Various Items
# Function to recommend products
def recommend_products(product_name):
    # Check for the product name in the antecedents using a case-insensitive search
    product_rules = rules[rules['antecedents'].apply(lambda x: any(product_name.lower() == str(item).lower() for item in x))]
    
    if product_rules.empty:
        return f"No recommendations found for '{product_name}'."
    
    recommended = product_rules['consequents'].apply(lambda x: list(x)[0])
    return recommended.tolist()

# Function to print the rules associated with a given product
def print_recommendation_proof(product_name):
    # Find rules where the product is an antecedent
    relevant_rules = rules[rules['antecedents'].apply(lambda x: product_name in x)]
    
    if relevant_rules.empty:
        print(f"No rules found for '{product_name}'.")
        return
    
    print(f"Rules for '{product_name}':")
    for index, row in relevant_rules.iterrows():
        antecedents = ', '.join(list(row['antecedents']))
        consequents = ', '.join(list(row['consequents']))
        support = row['support']
        confidence = row['confidence']
        lift = row['lift']
        
        print(f" - If you buy: {antecedents}, then you may also buy: {consequents}. (Support: {support:.2f}, Confidence: {confidence:.2f}, Lift: {lift:.2f})")

# List of products to check
test_items = [
    "WHITE HANGING HEART T-LIGHT HOLDER",
    "WHITE METAL LANTERN",
    "CREAM CUPID HEARTS COAT HANGER",
    "KNITTED UNION FLAG HOT WATER BOTTLE",
    "RED WOOLLY HOTTIE WHITE HEART.",
    "SET 7 BABUSHKA NESTING BOXES",
    "GLASS STAR FROSTED T-LIGHT HOLDER",
    "HAND WARMER UNION JACK",
    "HAND WARMER RED POLKA DOT",
    "JAM MAKING SET WITH JARS",
    "RED COAT RACK PARIS FASHION",
    "YELLOW COAT RACK PARIS FASHION",
    "BLUE COAT RACK PARIS FASHION",
    "ASSORTED COLOUR BIRD ORNAMENT",
    "POPPY'S PLAYHOUSE BEDROOM",
    "POPPY'S PLAYHOUSE KITCHEN",
    "FELTCRAFT PRINCESS CHARLOTTE DOLL",
    "IVORY KNITTED MUG COSY",
    "BOX OF 6 ASSORTED COLOUR TEASPOONS",
    "BOX OF VINTAGE JIGSAW BLOCKS",
    "BOX OF VINTAGE ALPHABET BLOCKS",
    "HOME BUILDING BLOCK WORD",
    "LOVE BUILDING BLOCK WORD",
    "RECIPE BOX WITH METAL HEART",
    "DOORMAT NEW ENGLAND",
    "BATH BUILDING BLOCK WORD",
    "ALARM CLOCK BAKELIKE PINK",
    "ALARM CLOCK BAKELIKE RED",
    "ALARM CLOCK BAKELIKE GREEN",
    "PANDA AND BUNNIES STICKER SHEET",
    "STARS GIFT TAPE",
    "INFLATABLE POLITICAL GLOBE",
    "VINTAGE HEADS AND TAILS CARD GAME",
    "SET/2 RED RETROSPOT TEA TOWELS",
    "ROUND SNACK BOXES SET OF4 WOODLAND",
    "SPACEBOY LUNCH BOX",
    "LUNCH BOX I LOVE LONDON",
    "CIRCUS PARADE LUNCH BOX",
    "CHARLOTTE BAG DOLLY GIRL DESIGN",
    "RED TOADSTOOL LED NIGHT LIGHT",
    "SET 2 TEA TOWELS I LOVE LONDON",
    "VINTAGE SEASIDE JIGSAW PUZZLES",
    "MINI JIGSAW CIRCUS PARADE",
    "MINI JIGSAW SPACEBOY",
    "MINI PAINT SET VINTAGE",
    "POSTAGE",
    "PAPER CHAIN KIT 50'S CHRISTMAS",
    "HAND WARMER RED POLKA DOT",
    "HAND WARMER UNION JACK",
    "WHITE HANGING HEART T-LIGHT HOLDER",
    "WHITE METAL LANTERN",
    "CREAM CUPID HEARTS COAT HANGER",
    "EDWARDIAN PARASOL RED",
    "RETRO COFFEE MUGS ASSORTED",
    "SAVE THE PLANET MUG",
    "VINTAGE BILLBOARD DRINK ME MUG",
    "VINTAGE BILLBOARD LOVE/HATE MUG",
    "WOOD 2 DRAWER CABINET WHITE FINISH",
    "WOOD S/3 CABINET ANT WHITE FINISH",
    "WOODEN PICTURE FRAME WHITE FINISH",
]

# Loop through the test items and display recommendations
for product_to_check in test_items:
    print(f"\nChecking recommendations for: {product_to_check}")
    recommended_products = recommend_products(product_to_check)
    print(f"Products recommended for '{product_to_check}': {recommended_products}")
    print_recommendation_proof(product_to_check)



Checking recommendations for: WHITE HANGING HEART T-LIGHT HOLDER
Products recommended for 'WHITE HANGING HEART T-LIGHT HOLDER': ['60 TEATIME FAIRY CAKE CASES', 'ASSORTED COLOUR BIRD ORNAMENT', 'CHOCOLATE HOT WATER BOTTLE', 'COOK WITH WINE METAL SIGN ', 'CREAM CUPID HEARTS COAT HANGER', 'CREAM HEART CARD HOLDER', 'GIN + TONIC DIET METAL SIGN', 'HAND OVER THE CHOCOLATE   SIGN ', 'HANGING HEART ZINC T-LIGHT HOLDER', 'HEART FILIGREE DOVE  SMALL', 'HEART FILIGREE DOVE LARGE', 'HEART IVORY TRELLIS LARGE', 'HEART OF WICKER LARGE', 'HEART OF WICKER SMALL', 'HOME BUILDING BLOCK WORD', 'JUMBO BAG RED RETROSPOT', 'JUMBO BAG STRAWBERRY', 'LOVE BUILDING BLOCK WORD', 'LUNCH BAG  BLACK SKULL.', 'LUNCH BAG CARS BLUE', 'LUNCH BAG RED SPOTTY', 'LUNCH BAG SUKI  DESIGN ', 'NATURAL SLATE HEART CHALKBOARD ', 'PACK OF 72 RETRO SPOT CAKE CASES', 'PLEASE ONE PERSON METAL SIGN', 'RED HANGING HEART T-LIGHT HOLDER', 'RED WOOLLY HOTTIE WHITE HEART.', 'REGENCY CAKESTAND 3 TIER', 'REX CASH+CARRY JUMBO SHOPPER', 'SC