In [1]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
!pip install xlrd==1.2.0



In [3]:
# Read the retail transactions workbook into a frame and preview its first rows
data = pd.read_excel(io='Online_Retail.xls')
data.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [4]:

# Inspect the column labels of the loaded frame
data.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [5]:

# List the distinct countries that appear in the transactions
data['Country'].unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland'], dtype=object)

In [6]:
# Clean the transactions in a single pass:
#   1. trim surrounding whitespace from the item descriptions,
#   2. discard rows that carry no invoice number,
#   3. store invoice numbers as text,
#   4. leave out credit transactions (invoice numbers containing 'C').
data = (
    data
    .assign(Description=lambda frame: frame['Description'].str.strip())
    .dropna(axis=0, subset=['InvoiceNo'])
    .assign(InvoiceNo=lambda frame: frame['InvoiceNo'].astype('str'))
    .loc[lambda frame: ~frame['InvoiceNo'].str.contains('C')]
)

In [7]:
# France basket: one row per invoice, one column per item description,
# each cell holding the summed quantity (0 where the invoice lacks the item)
basket_France = (
    data.loc[data['Country'] == "France"]
    .groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

In [8]:
# Defining the hot encoding function to make the data suitable
# for the concerned libraries
def hot_encode(x):
    """Turn a summed quantity into a 0/1 basket flag.

    Parameters
    ----------
    x : number
        Quantity of one item on one invoice.

    Returns
    -------
    int
        1 when ``x`` is at least 1, otherwise 0.

    Notes
    -----
    The result is always an int. Values strictly between 0 and 1, and NaN,
    used to fall through both conditions and yield ``None``; they now map
    to 0 so the encoded table only ever contains 0 and 1.
    """
    # A single comparison covers every input: NaN and fractional quantities
    # below 1 compare False and therefore encode as "not purchased".
    return 1 if x >= 1 else 0
 
# One-hot encode every cell of the France basket; df and basket_encoded
# are two names for the same encoded frame
df = basket_encoded = basket_France.applymap(hot_encode)

In [9]:
# Display the one-hot encoded France basket (rows: invoices, columns: item descriptions)
df

Description,10 COLOUR SPACEBOY PEN,5 HOOK HANGER RED MAGIC TOADSTOOL,ALARM CLOCK BAKELIKE GREEN,ALARM CLOCK BAKELIKE ORANGE,ALARM CLOCK BAKELIKE PINK,ALARM CLOCK BAKELIKE RED,ASSORTED COLOUR MINI CASES,BASKET OF TOADSTOOLS,BIG DOUGHNUT FRIDGE MAGNETS,BIRD HOUSE HOT WATER BOTTLE,...,STRAWBERRY LUNCH BOX WITH CUTLERY,TABLECLOTH RED APPLES DESIGN,TEA BAG PLATE RED RETROSPOT,TEA PARTY BIRTHDAY CARD,TOOL BOX SOFT TOY,TRADITIONAL WOODEN CATCH CUP GAME,VINTAGE HEADS AND TAILS CARD GAME,VINTAGE SEASIDE JIGSAW PUZZLES,WOODLAND STICKERS,WOODLAND CHARLOTTE BAG
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536370,0,0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
536852,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
536974,0,0,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,0,0,0
537065,0,1,1,1,1,1,1,1,0,0,...,1,0,0,0,1,1,0,0,0,0
537463,0,0,0,0,0,0,0,0,1,0,...,1,1,1,1,0,0,0,0,1,1
537468,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [10]:
from mlxtend.frequent_patterns import apriori
import time

def apriori_sort(df, min_support, sort = "none"):
    """Mine frequent itemsets with ``apriori`` and time the run.

    Parameters
    ----------
    df : DataFrame
        Transaction table, handed unchanged to ``apriori``.
    min_support : float
        Minimum support threshold, forwarded to ``apriori``.
    sort : str, default "none"
        "sup" orders the result by ascending support, "items" orders it
        lexicographically by the item labels of each itemset. Any other
        value leaves the rows in the order ``apriori`` produced them.

    Returns
    -------
    list
        ``[frequent_itemsets, elapsed_seconds]``. The elapsed time covers
        the mining step and the optional sort.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    dfa = apriori(df, min_support, use_colnames=True)
    if sort == "sup":
        print("Сортировка по support")
        dfa = dfa.sort_values(by=['support'])
    elif sort == "items":
        print("Сортировка по itemsets")
        # The itemsets column holds set-like values (frozensets in mlxtend),
        # whose "<" is a subset test rather than a total order, so sorting
        # the column directly gives an undefined row order. Sort on a tuple
        # of the sorted item labels instead.
        keys = [tuple(sorted(map(str, items))) for items in dfa['itemsets']]
        order = sorted(range(len(keys)), key=keys.__getitem__)
        dfa = dfa.iloc[order]
    endtime = time.perf_counter() - start_time
    return [dfa, endtime]


In [23]:
# Mine the encoded basket at a 0.17 support threshold, result left unsorted
min_support, sort = 0.17, "none"
dfa_list = apriori_sort(df, min_support, sort=sort)

# dfa_list is [frequent itemsets, elapsed seconds]
print("Time of execution: ", dfa_list[1])
print("Min support: ", min_support)
dfa_list[0]

Time of execution:  0.07795858383178711
Min support:  0.17


Unnamed: 0,support,itemsets
0,0.333333,(ALARM CLOCK BAKELIKE GREEN)
1,0.333333,(ALARM CLOCK BAKELIKE PINK)
2,0.333333,(ALARM CLOCK BAKELIKE RED)
3,0.333333,(ASSORTED COLOUR MINI CASES)
4,0.333333,(BIG DOUGHNUT FRIDGE MAGNETS)
...,...,...
452,0.333333,"(SET OF SALT AND PEPPER TOADSTOOLS, ROUND SNAC..."
453,0.333333,"(SET OF SALT AND PEPPER TOADSTOOLS, ROUND SNAC..."
454,0.333333,"(SET OF SALT AND PEPPER TOADSTOOLS, ROUND SNAC..."
455,0.333333,"(SET OF SALT AND PEPPER TOADSTOOLS, ROUND SNAC..."


In [28]:
# Mine the encoded basket at a 0.25 support threshold, result left unsorted
min_support, sort = 0.25, "none"
dfa_list = apriori_sort(df, min_support, sort=sort)

# dfa_list is [frequent itemsets, elapsed seconds]
print("Time of execution: ", dfa_list[1])
print("Min support: ", min_support)
dfa_list[0]

Time of execution:  0.04500317573547363
Min support:  0.25


Unnamed: 0,support,itemsets
0,0.333333,(ALARM CLOCK BAKELIKE GREEN)
1,0.333333,(ALARM CLOCK BAKELIKE PINK)
2,0.333333,(ALARM CLOCK BAKELIKE RED)
3,0.333333,(ASSORTED COLOUR MINI CASES)
4,0.333333,(BIG DOUGHNUT FRIDGE MAGNETS)
...,...,...
452,0.333333,"(SET OF SALT AND PEPPER TOADSTOOLS, ROUND SNAC..."
453,0.333333,"(SET OF SALT AND PEPPER TOADSTOOLS, ROUND SNAC..."
454,0.333333,"(SET OF SALT AND PEPPER TOADSTOOLS, ROUND SNAC..."
455,0.333333,"(SET OF SALT AND PEPPER TOADSTOOLS, ROUND SNAC..."


In [36]:
# Mine the encoded basket at a 0.35 support threshold, result left unsorted
min_support, sort = 0.35, "none"
dfa_list = apriori_sort(df, min_support, sort=sort)

# dfa_list is [frequent itemsets, elapsed seconds]
print("Time of execution: ", dfa_list[1])
print("Min support: ", min_support)
dfa_list[0]

Time of execution:  0.012159109115600586
Min support:  0.35


Unnamed: 0,support,itemsets
0,0.5,(LUNCH BAG RED RETROSPOT)
1,0.5,(LUNCH BOX WITH CUTLERY RETROSPOT)
2,1.0,(POSTAGE)
3,0.5,(ROUND SNACK BOXES SET OF4 WOODLAND)
4,0.5,(SET OF SALT AND PEPPER TOADSTOOLS)
5,0.5,"(LUNCH BAG RED RETROSPOT, POSTAGE)"
6,0.5,"(LUNCH BOX WITH CUTLERY RETROSPOT, POSTAGE)"
7,0.5,"(SET OF SALT AND PEPPER TOADSTOOLS, LUNCH BOX ..."
8,0.5,"(ROUND SNACK BOXES SET OF4 WOODLAND, POSTAGE)"
9,0.5,"(SET OF SALT AND PEPPER TOADSTOOLS, POSTAGE)"


In [44]:
# Mine the encoded basket at a 0.45 support threshold, result left unsorted
min_support, sort = 0.45, "none"
dfa_list = apriori_sort(df, min_support, sort=sort)

# dfa_list is [frequent itemsets, elapsed seconds]
print("Time of execution: ", dfa_list[1])
print("Min support: ", min_support)
dfa_list[0]

Time of execution:  0.010229825973510742
Min support:  0.45


Unnamed: 0,support,itemsets
0,0.5,(LUNCH BAG RED RETROSPOT)
1,0.5,(LUNCH BOX WITH CUTLERY RETROSPOT)
2,1.0,(POSTAGE)
3,0.5,(ROUND SNACK BOXES SET OF4 WOODLAND)
4,0.5,(SET OF SALT AND PEPPER TOADSTOOLS)
5,0.5,"(LUNCH BAG RED RETROSPOT, POSTAGE)"
6,0.5,"(LUNCH BOX WITH CUTLERY RETROSPOT, POSTAGE)"
7,0.5,"(SET OF SALT AND PEPPER TOADSTOOLS, LUNCH BOX ..."
8,0.5,"(ROUND SNACK BOXES SET OF4 WOODLAND, POSTAGE)"
9,0.5,"(SET OF SALT AND PEPPER TOADSTOOLS, POSTAGE)"


In [46]:
# Mine the encoded basket at a 0.6 support threshold, result left unsorted
min_support, sort = 0.6, "none"
dfa_list = apriori_sort(df, min_support, sort=sort)

# dfa_list is [frequent itemsets, elapsed seconds]
print("Time of execution: ", dfa_list[1])
print("Min support: ", min_support)
dfa_list[0]

Time of execution:  0.007476329803466797
Min support:  0.6


Unnamed: 0,support,itemsets
0,1.0,(POSTAGE)
