# Import Library

In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Load Data

In [2]:
file_name = "/Users/user/Downloads/Online Retail.xlsx"
data = pd.read_excel(file_name)
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [3]:
data.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [4]:
data.Country.unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

# Clean Data

In [5]:
data["Description"] = data["Description"].str.strip()

data.dropna(axis=0, subset=["InvoiceNo"], inplace=True)
data["InvoiceNo"] = data["InvoiceNo"].astype("str")

# Dropping all transactions which were done on credit 
data = data[~data['InvoiceNo'].str.contains('C')]

Splitting the data according to the region of transaction

In [6]:
# Transactions done in France 
basket_France = (data[data['Country'] =="France"] 
		.groupby(['InvoiceNo', 'Description'])['Quantity'] 
		.sum().unstack().reset_index().fillna(0) 
		.set_index('InvoiceNo')) 

# Transactions done in the United Kingdom 
basket_UK = (data[data['Country'] =="United Kingdom"] 
		.groupby(['InvoiceNo', 'Description'])['Quantity'] 
		.sum().unstack().reset_index().fillna(0) 
		.set_index('InvoiceNo')) 

# Transactions done in Portugal 
basket_Por = (data[data['Country'] =="Portugal"] 
		.groupby(['InvoiceNo', 'Description'])['Quantity'] 
		.sum().unstack().reset_index().fillna(0) 
		.set_index('InvoiceNo')) 

basket_Sweden = (data[data['Country'] =="Sweden"] 
		.groupby(['InvoiceNo', 'Description'])['Quantity'] 
		.sum().unstack().reset_index().fillna(0) 
		.set_index('InvoiceNo')) 

# OneHot Encoding data

In [7]:
# Defining the hot encoding function to make the data suitable 
# for the concerned libraries 
def hot_encode(x): 
	if(x<= 0): 
		return 0
	if(x>= 1): 
		return 1

# Encoding the datasets 
basket_encoded = basket_France.applymap(hot_encode) 
basket_France = basket_encoded 

basket_encoded = basket_UK.applymap(hot_encode) 
basket_UK = basket_encoded 

basket_encoded = basket_Por.applymap(hot_encode) 
basket_Por = basket_encoded 

basket_encoded = basket_Sweden.applymap(hot_encode) 
basket_Sweden = basket_encoded 


  basket_encoded = basket_France.applymap(hot_encode)
  basket_encoded = basket_UK.applymap(hot_encode)
  basket_encoded = basket_Por.applymap(hot_encode)
  basket_encoded = basket_Sweden.applymap(hot_encode)


# Building the models and analyzing the results
France

In [8]:
# Building the model 
frq_items = apriori(basket_France, min_support = 0.05, use_colnames = True) 

# Collecting the inferred rules in a dataframe 
rules = association_rules(frq_items, metric ="lift", min_threshold = 1) 
rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False]) 
print(rules.head()) 


                                           antecedents  \
45                        (JUMBO BAG WOODLAND ANIMALS)   
260  (RED TOADSTOOL LED NIGHT LIGHT, PLASTERS IN TI...   
272  (RED TOADSTOOL LED NIGHT LIGHT, PLASTERS IN TI...   
301  (SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...   
302  (SET/6 RED SPOTTY PAPER PLATES, SET/20 RED RET...   

                         consequents  antecedent support  consequent support  \
45                         (POSTAGE)            0.076531            0.765306   
260                        (POSTAGE)            0.051020            0.765306   
272                        (POSTAGE)            0.053571            0.765306   
301  (SET/6 RED SPOTTY PAPER PLATES)            0.102041            0.127551   
302    (SET/6 RED SPOTTY PAPER CUPS)            0.102041            0.137755   

      support  confidence      lift  leverage  conviction  zhangs_metric  
45   0.076531       1.000  1.306667  0.017961         inf       0.254144  
260  0.051020       



United Kingdom

In [9]:
frq_items = apriori(basket_UK, min_support = 0.01, use_colnames = True) 
rules = association_rules(frq_items, metric ="lift", min_threshold = 1) 
rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False]) 
print(rules.head()) 



                                       antecedents             consequents  \
116           (BEADED CRYSTAL HEART PINK ON STICK)        (DOTCOM POSTAGE)   
2018  (JAM MAKING SET PRINTED, SUKI  SHOULDER BAG)        (DOTCOM POSTAGE)   
2295         (HERB MARKER MINT, HERB MARKER THYME)  (HERB MARKER ROSEMARY)   
2302   (HERB MARKER ROSEMARY, HERB MARKER PARSLEY)     (HERB MARKER THYME)   
2301      (HERB MARKER THYME, HERB MARKER PARSLEY)  (HERB MARKER ROSEMARY)   

      antecedent support  consequent support   support  confidence       lift  \
116             0.011036            0.037928  0.010768    0.975728  25.725872   
2018            0.011625            0.037928  0.011196    0.963134  25.393807   
2295            0.010714            0.012375  0.010232    0.955000  77.173095   
2302            0.011089            0.012321  0.010553    0.951691  77.240055   
2301            0.011089            0.012375  0.010553    0.951691  76.905682   

      leverage  conviction  zhangs_metric  
