Step 1: Importing the required libraries

In [2]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

Step 2: Loading and exploring the data

In [3]:
# Read the "Online Retail" workbook straight from the UCI repository
# into a DataFrame, then preview the first rows.
DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx'
data = pd.read_excel(DATA_URL)
data.head()


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


Step 3: Cleaning the data

In [8]:
# Clean the raw transactions without mutating the frame in place, so the
# cell can be re-run safely and always ends with an independent copy
# (no SettingWithCopyWarning when later cells assign into `data`).
data = (
    data
    # Strip leading/trailing whitespace from the item descriptions
    .assign(Description=lambda df: df['Description'].str.strip())
    # Drop the rows without any invoice number
    .dropna(subset=['InvoiceNo'])
    # Invoice numbers are compared as text below
    .assign(InvoiceNo=lambda df: df['InvoiceNo'].astype('str'))
    # Drop cancelled transactions: their invoice number carries a 'C'
    .loc[lambda df: ~df['InvoiceNo'].str.contains('C')]
    .copy()
)


Step 4: Splitting the data according to the country of the transaction

In [5]:
def load_data(data, country):
  """Build the invoice-by-item quantity table for one country.

  Parameters
  ----------
  data : pandas.DataFrame
      Transactions with 'Country', 'InvoiceNo', 'Description' and
      'Quantity' columns.
  country : str
      Value of the 'Country' column to keep.

  Returns
  -------
  pandas.DataFrame
      One row per invoice (indexed by 'InvoiceNo'), one column per item
      description; each cell holds the summed quantity, 0 where the
      item does not appear on the invoice.
  """
  country_rows = data.loc[data['Country'] == country]
  # Total quantity of every item on every invoice
  quantities = country_rows.groupby(['InvoiceNo', 'Description'])['Quantity'].sum()
  # Pivot descriptions into columns; missing combinations become 0
  wide = quantities.unstack().reset_index().fillna(0)
  return wide.set_index('InvoiceNo')

In [12]:
# Build one invoice-by-item basket per market of interest:
# France first, then the United Kingdom.
basket_france, basket_uk = (
    load_data(data, market) for market in ("France", "United Kingdom")
)

Step 5: One-hot encoding the data

In [13]:
# Defining the hot encoding function to make the data suitable
# for the concerned libraries
def hot_encode(x):
    """Map a basket quantity to a 0/1 purchase flag.

    Parameters
    ----------
    x : number
        Quantity of one item on one invoice.

    Returns
    -------
    int
        1 when ``x >= 1``, otherwise 0.
    """
    # A single comparison guarantees a 0/1 result for every input,
    # including fractional values below 1 and NaN (both map to 0),
    # so no cell is ever encoded as None.
    return 1 if x >= 1 else 0

# Replace each basket's quantities with 0/1 purchase flags, cell by cell
basket_france = basket_france.applymap(hot_encode)
basket_uk = basket_uk.applymap(hot_encode)

# Keep the generic name bound to the most recently encoded basket
basket_encoded = basket_uk

Step 6: Building the models and analyzing the results

In [14]:
# Mine frequent itemsets from the UK baskets (support of at least 1%),
# derive association rules with lift >= 1, and rank them by confidence,
# breaking ties by lift (both descending).
frq_items = apriori(basket_uk, min_support=0.01, use_colnames=True)
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=[False, False])
)
print(rules.head())

                                       antecedents  ... conviction
117           (BEADED CRYSTAL HEART PINK ON STICK)  ...  39.637371
2019  (SUKI  SHOULDER BAG, JAM MAKING SET PRINTED)  ...  26.096206
2296         (HERB MARKER THYME, HERB MARKER MINT)  ...  21.947227
2301   (HERB MARKER ROSEMARY, HERB MARKER PARSLEY)  ...  20.444951
2302      (HERB MARKER THYME, HERB MARKER PARSLEY)  ...  20.443842

[5 rows x 9 columns]


In [15]:
# Mine frequent itemsets from the French baskets (support of at least 1%),
# derive association rules with lift >= 1, and rank them by confidence,
# breaking ties by lift (both descending).
frq_items = apriori(basket_france, min_support=0.01, use_colnames=True)
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=[False, False])
)
print(rules.head())

                                             antecedents  ... conviction
67422  (ALARM CLOCK BAKELIKE CHOCOLATE, ALARM CLOCK B...  ...        inf
67427  (DINOSAUR LUNCH BOX WITH CUTLERY, PLASTERS IN ...  ...        inf
69602  (SKULL LUNCH BOX WITH CUTLERY, ALARM CLOCK BAK...  ...        inf
69607  (ALARM CLOCK BAKELIKE GREEN, BUNDLE OF 3 ALPHA...  ...        inf
71644           (ALARM CLOCK BAKELIKE GREEN, PHOTO CUBE)  ...        inf

[5 rows x 9 columns]
