Apriori Algorithm

In [1]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules


In [11]:

# Read the retail transactions CSV into a DataFrame and preview the first rows
data = pd.read_csv(filepath_or_buffer='Online_Retail.csv')
data.head()


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [12]:
# Exploring the columns of the data: show the column labels of the loaded
# DataFrame so the available fields are known before any cleaning is done
data.columns


Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [13]:
# List the distinct values found in the Country column
# (i.e. the regions in which transactions were recorded)
data['Country'].unique()


array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'United Kingd'],
      dtype=object)

In [30]:
pip install apyori

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [14]:
# Clean the loaded transactions as one top-down pipeline. Each step returns
# a new frame (no `inplace=True`), so the cell has no hidden in-place mutation.
data = (
    data
    # Strip extra spaces in the description
    .assign(Description=data['Description'].str.strip())
    # Drop the rows without any invoice number
    .dropna(axis=0, subset=['InvoiceNo'])
    # Store invoice numbers as strings so they can be pattern-matched below
    .astype({'InvoiceNo': 'str'})
)

# Drop every transaction whose invoice number contains the letter 'C'.
# NOTE(review): these were previously described as "transactions done on
# credit"; in the UCI Online Retail data a 'C' invoice marks a cancellation —
# confirm which meaning applies to this file.
data = data[~data['InvoiceNo'].str.contains('C')]


In [15]:
def _basket_for_country(transactions, country):
    """Build the invoice-by-item quantity matrix for a single country.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Transaction rows; the 'Country', 'InvoiceNo', 'Description' and
        'Quantity' columns are read.
    country : str
        Value of the 'Country' column whose rows are kept.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'InvoiceNo' with one column per 'Description'. Each cell
        holds the summed 'Quantity' for that invoice/item pair, and 0 where
        the pair does not occur.
    """
    return (transactions[transactions['Country'] == country]
            .groupby(['InvoiceNo', 'Description'])['Quantity']
            .sum().unstack().reset_index().fillna(0)
            .set_index('InvoiceNo'))


# Transactions done in France
basket_France = _basket_for_country(data, "France")

# Transactions done in the United Kingdom
basket_UK = _basket_for_country(data, "United Kingdom")

# Transactions done in Portugal
basket_Por = _basket_for_country(data, "Portugal")

# Transactions done in Sweden
basket_Sweden = _basket_for_country(data, "Sweden")


In [16]:
# Defining the hot encoding function to make the data suitable
# for the concerned libraries
def hot_encode(x):
    """Map a numeric cell value to a 0/1 presence flag.

    Parameters
    ----------
    x : int or float
        A single cell value (the notebook applies this element-wise to the
        per-invoice item quantity tables).

    Returns
    -------
    int
        1 when ``x >= 1``, otherwise 0.

    Notes
    -----
    The function always returns an int. Values strictly between 0 and 1,
    and NaN, map to 0 instead of falling through and yielding ``None``,
    which would leave non-numeric cells in the encoded table.
    """
    # A single comparison covers every input: anything that is not >= 1
    # (zero, negatives, fractions below 1, NaN) is encoded as absent.
    return 1 if x >= 1 else 0

# Apply hot_encode element-wise to every basket table, rebinding each name
# to its encoded version.
# NOTE: DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; it is kept here so the cell still runs on older pandas.
basket_France = basket_France.applymap(hot_encode)
basket_UK = basket_UK.applymap(hot_encode)
basket_Por = basket_Por.applymap(hot_encode)
basket_Sweden = basket_Sweden.applymap(hot_encode)

# `basket_encoded` ends up referring to the last encoded table (Sweden),
# exactly as it did when it was used as a temporary for each step.
basket_encoded = basket_Sweden


In [17]:
# Mine frequent itemsets from the French baskets (minimum support 0.05),
# keeping item names rather than column indices in the result
frq_items = apriori(basket_France, min_support=0.05, use_colnames=True)

# Derive association rules filtered on lift >= 1, then order them by
# confidence and lift, both descending, and show the top rows
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head())


                                           antecedents  \
914               (POSTAGE, ALARM CLOCK BAKELIKE PINK)   
917  (ALARM CLOCK BAKELIKE PINK, ALARM CLOCK BAKELI...   
4                          (ALARM CLOCK BAKELIKE PINK)   
347  (ALARM CLOCK BAKELIKE ORANGE, ALARM CLOCK BAKE...   
358  (ALARM CLOCK BAKELIKE PINK, ALARM CLOCK BAKELI...   

                                           consequents  antecedent support  \
914  (ALARM CLOCK BAKELIKE RED, ALARM CLOCK BAKELIK...            0.058824   
917              (POSTAGE, ALARM CLOCK BAKELIKE GREEN)            0.058824   
4                         (ALARM CLOCK BAKELIKE GREEN)            0.068627   
347                         (ALARM CLOCK BAKELIKE RED)            0.058824   
358                       (ALARM CLOCK BAKELIKE GREEN)            0.058824   

     consequent support   support  confidence       lift  leverage  conviction  
914            0.068627  0.058824         1.0  14.571429  0.054787         inf  
917            0.068

In [18]:
# Mine frequent itemsets from the United Kingdom baskets (minimum support 0.01)
frq_items = apriori(basket_UK, min_support=0.01, use_colnames=True)

# Derive association rules filtered on lift >= 1, ordered by confidence and
# lift (both descending), and show the top rows
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head())


                                            antecedents       consequents  \
3367  (GREEN REGENCY TEACUP AND SAUCER, BEADED CRYST...  (DOTCOM POSTAGE)   
3373  (HEART DECORATION RUSTIC HANGING, BEADED CRYST...  (DOTCOM POSTAGE)   
3386  (BEADED CRYSTAL HEART PINK ON STICK, JAM MAKIN...  (DOTCOM POSTAGE)   
3392  (LOVEBIRD HANGING DECORATION WHITE, BEADED CRY...  (DOTCOM POSTAGE)   
3397  (VICTORIAN GLASS HANGING T-LIGHT, BEADED CRYST...  (DOTCOM POSTAGE)   

      antecedent support  consequent support   support  confidence       lift  \
3367            0.010716            0.047455  0.010716         1.0  21.072581   
3373            0.011098            0.047455  0.011098         1.0  21.072581   
3386            0.010333            0.047455  0.010333         1.0  21.072581   
3392            0.010142            0.047455  0.010142         1.0  21.072581   
3397            0.010142            0.047455  0.010142         1.0  21.072581   

      leverage  conviction  
3367  0.010207       