In [4]:
import pandas as pd 
import numpy as np

#pip install mlxtend
from mlxtend.frequent_patterns import apriori, association_rules 

In [5]:
data_file = "OnlineRetail_short.xlsx"
data = pd.read_excel(data_file)
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536370,22728,ALARM CLOCK BAKELIKE PINK,24,2010-12-01 08:45:00,3.75,12583.0,France
1,536370,22727,ALARM CLOCK BAKELIKE RED,24,2010-12-01 08:45:00,3.75,12583.0,France
2,536370,22726,ALARM CLOCK BAKELIKE GREEN,12,2010-12-01 08:45:00,3.75,12583.0,France
3,536370,21724,PANDA AND BUNNIES STICKER SHEET,12,2010-12-01 08:45:00,0.85,12583.0,France
4,536370,21883,STARS GIFT TAPE,24,2010-12-01 08:45:00,0.65,12583.0,France


In [6]:
#show column names
data.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [7]:
#show unique country names
np.unique(data.Country)

array(['Australia', 'Austria', 'Bahrain', 'Belgium', 'Brazil', 'Canada',
       'Channel Islands', 'Cyprus', 'Czech Republic', 'Denmark', 'EIRE',
       'European Community', 'Finland', 'France', 'Germany', 'Greece',
       'Hong Kong', 'Iceland', 'Israel', 'Italy', 'Japan', 'Lebanon',
       'Lithuania', 'Malta', 'Netherlands', 'Norway', 'Poland',
       'Portugal', 'RSA', 'Saudi Arabia', 'Singapore', 'Spain', 'Sweden',
       'Switzerland', 'USA', 'United Arab Emirates', 'Unspecified'],
      dtype=object)

In [8]:
#cleaning the data-------- remove extra spaces in the description 
data['Description'] = data['Description'].str.strip()

len(data)

46431

In [9]:
# Dropping the rows without any invoice number 
data.dropna(axis = 0, subset =['InvoiceNo'], inplace = True) 
data['InvoiceNo'] = data['InvoiceNo'].astype('str')

len(data)

46431

In [10]:
# Dropping all transactions which were done on credit 

# ~ (tilde) --> invert  .. so drop rows where invoice contains 'C'
data = data[~data['InvoiceNo'].str.contains('C')]

len(data)

44999

In [11]:
# only dataset from france
basket_france = (data[data['Country'] =="France"])

#basket.loc[basket['InvoiceNo']== '537468']
print(len(basket_france))
basket_france.head()

8408


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536370,22728,ALARM CLOCK BAKELIKE PINK,24,2010-12-01 08:45:00,3.75,12583.0,France
1,536370,22727,ALARM CLOCK BAKELIKE RED,24,2010-12-01 08:45:00,3.75,12583.0,France
2,536370,22726,ALARM CLOCK BAKELIKE GREEN,12,2010-12-01 08:45:00,3.75,12583.0,France
3,536370,21724,PANDA AND BUNNIES STICKER SHEET,12,2010-12-01 08:45:00,0.85,12583.0,France
4,536370,21883,STARS GIFT TAPE,24,2010-12-01 08:45:00,0.65,12583.0,France


In [12]:
# dataset of france only
basket = (data[data['Country'] == "France"]
          .groupby(['InvoiceNo', 'Description'])['Quantity']
          .sum().fillna(0)
         )

basket.head()

InvoiceNo  Description                    
536370     ALARM CLOCK BAKELIKE GREEN         12
           ALARM CLOCK BAKELIKE PINK          24
           ALARM CLOCK BAKELIKE RED           24
           CHARLOTTE BAG DOLLY GIRL DESIGN    20
           CIRCUS PARADE LUNCH BOX            24
Name: Quantity, dtype: int64

In [13]:
# unstack - put items as column

basket_France = (data[data['Country'] =="France"] 
          .groupby(['InvoiceNo', 'Description'])['Quantity'] 
          .sum().unstack().fillna(0)
        #   .reset_index().set_index('InvoiceNo')
            )

print(len(basket_France))
basket_France.head() 

392


Description,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 EGG HOUSE PAINTED WOOD,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE WOODLAND,...,WRAP VINTAGE PETALS DESIGN,YELLOW COAT RACK PARIS FASHION,YELLOW GIANT GARDEN THERMOMETER,YELLOW SHARK HELICOPTER,ZINC STAR T-LIGHT HOLDER,ZINC FOLKART SLEIGH BELLS,ZINC HERB GARDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536974,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# accessing first row.
basket_france.iloc[1]

InvoiceNo                        536370
StockCode                         22727
Description    ALARM CLOCK BAKELIKE RED
Quantity                             24
InvoiceDate         2010-12-01 08:45:00
UnitPrice                          3.75
CustomerID                        12583
Country                          France
Name: 1, dtype: object

In [15]:
# for portugal

basket_Portugal = (data[data['Country'] =="Portugal"] 
          .groupby(['InvoiceNo', 'Description'])['Quantity'] 
          .sum().unstack().fillna(0) 
          .reset_index().set_index('InvoiceNo')
          ) 
print('portugal',len(basket_Portugal))

# for sweden

basket_Sweden = (data[data['Country'] =="Sweden"] 
          .groupby(['InvoiceNo', 'Description'])['Quantity'] 
          .sum().unstack().fillna(0) 
          .reset_index().set_index('InvoiceNo')
          )
print('sweden',len(basket_Sweden))

portugal 58
sweden 36


In [16]:
# In this data set if the quanity is less than zero. it is inserted as negative. so to remove it.
def h_encode(x): 
    if(x<= 0): 
        return 0
    if(x>= 1): 
        return 1

In [17]:
# Encoding the datasets 
#apply the encoding function, elementwise i.e encode it (1 or 0)

basket_France = basket_France.applymap(h_encode) 
basket_Por = basket_Portugal.applymap(h_encode) 
basket_Sweden = basket_Sweden.applymap(h_encode) 

In [18]:
basket_France.head()

Description,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 EGG HOUSE PAINTED WOOD,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE WOODLAND,...,WRAP VINTAGE PETALS DESIGN,YELLOW COAT RACK PARIS FASHION,YELLOW GIANT GARDEN THERMOMETER,YELLOW SHARK HELICOPTER,ZINC STAR T-LIGHT HOLDER,ZINC FOLKART SLEIGH BELLS,ZINC HERB GARDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536370,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536852,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536974,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
537065,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
537463,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### minimum Support
The support and confidence are measures to measure how interesting a rule is.

The minimum support and minimum confidence are set by the users, and are parameters of the Apriori algorithm for association rule generation. These parameters are used to exclude rules in the result that have a support or a confidence lower than the minimum support and minimum confidence respectively.

So to answer your question, when you say that: "For an example when support and confidence is given as 60% and 60% respectively what is the minimum support?" you probably mean that you have set the minimum support and confidence to 60 %.

# Apriori Algorithm

#### antecedents 	consequents
antecedent means earlier (either in time or in order), whereas consequent means following as a result

Consequent is the second half of a hypothetical proposition. 
 - In the standard form of such a proposition, it is the part that follows "then". 
 - In an implication, if P implies Q, then P is called the antecedent and Q is called the consequent.
 
metric = 'lift' , 'confidence' 

The lift metric is commonly used to measure how much more often the antecedent (A) and consequent (C) of a rule A->C occur together (than we would expect if they were statistically independent)

In [19]:
# Building the model - France

frq_items = apriori(basket_France, min_support = 0.05, use_colnames = True)
print(frq_items.head())

# Collecting the inferred rules in a dataframe 

rules = association_rules(frq_items, metric ="lift", min_threshold = 1) 

rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False]) 
rules.head()


    support                       itemsets
0  0.071429  (4 TRADITIONAL SPINNING TOPS)
1  0.096939   (ALARM CLOCK BAKELIKE GREEN)
2  0.102041    (ALARM CLOCK BAKELIKE PINK)
3  0.094388     (ALARM CLOCK BAKELIKE RED)
4  0.068878   (ASSORTED COLOUR MINI CASES)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
45,(JUMBO BAG WOODLAND ANIMALS),(POSTAGE),0.076531,0.765306,0.076531,1.0,1.306667,0.017961,inf
259,"(PLASTERS IN TIN CIRCUS PARADE, RED TOADSTOOL ...",(POSTAGE),0.05102,0.765306,0.05102,1.0,1.306667,0.011974,inf
271,"(PLASTERS IN TIN WOODLAND ANIMALS, RED TOADSTO...",(POSTAGE),0.053571,0.765306,0.053571,1.0,1.306667,0.012573,inf
301,"(SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO...",(SET/6 RED SPOTTY PAPER PLATES),0.102041,0.127551,0.09949,0.975,7.644,0.086474,34.897959
302,"(SET/6 RED SPOTTY PAPER PLATES, SET/20 RED RET...",(SET/6 RED SPOTTY PAPER CUPS),0.102041,0.137755,0.09949,0.975,7.077778,0.085433,34.489796


paper cups and paper and plates are bought together in France. This is because the French have a culture of having a get-together with their friends and family atleast once a week. Also, since the French government has banned the use of plastic in the country, the people have to purchase the paper -based alternatives

In [20]:
# Building the model - Sweden
frq_items = apriori(basket_Sweden, min_support = 0.05, use_colnames = True)
print(frq_items.head())

# Collecting the inferred rules in a dataframe 
rules = association_rules(frq_items, metric ="lift", min_threshold = 1) #metric = 'lift'

rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False])
rules.head()

    support                           itemsets
0  0.055556      (12 PENCILS SMALL TUBE SKULL)
1  0.055556            (36 DOILIES DOLLY GIRL)
2  0.111111  (60 CAKE CASES DOLLY GIRL DESIGN)
3  0.111111  (60 CAKE CASES VINTAGE CHRISTMAS)
4  0.055556      (60 TEATIME FAIRY CAKE CASES)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(12 PENCILS SMALL TUBE SKULL),(PACK OF 72 SKULL CAKE CASES),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
1,(PACK OF 72 SKULL CAKE CASES),(12 PENCILS SMALL TUBE SKULL),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
4,(ASSORTED BOTTLE TOP MAGNETS),(36 DOILIES DOLLY GIRL),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
5,(36 DOILIES DOLLY GIRL),(ASSORTED BOTTLE TOP MAGNETS),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
180,(CHILDRENS CUTLERY DOLLY GIRL),(CHILDRENS CUTLERY CIRCUS PARADE),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf


On analyzing the above rules, it is found that boys’ and girls’ cutlery are paired together. This is perhaps, when a parent goes shopping for cutlery for his/her children, he/she would want the product to be a little customized according to the kid’s wishes.

In [21]:
# for USA
basket_usa = (data[data['Country'] =="USA"] 
          .groupby(['InvoiceNo', 'Description'])['Quantity'] 
          .sum().unstack().fillna(0) 
        #   .reset_index().set_index('InvoiceNo')
          )

basket_usa = basket_usa.applymap(h_encode)

# Building the model - USA
frq_items = apriori(basket_France, min_support = 0.05, use_colnames = True)
print(frq_items.head())

# Collecting the inferred rules in a dataframe 
rules = association_rules(frq_items, metric ="lift", min_threshold = 1) 
rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False]) 
rules.head()


    support                       itemsets
0  0.071429  (4 TRADITIONAL SPINNING TOPS)
1  0.096939   (ALARM CLOCK BAKELIKE GREEN)
2  0.102041    (ALARM CLOCK BAKELIKE PINK)
3  0.094388     (ALARM CLOCK BAKELIKE RED)
4  0.068878   (ASSORTED COLOUR MINI CASES)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
45,(JUMBO BAG WOODLAND ANIMALS),(POSTAGE),0.076531,0.765306,0.076531,1.0,1.306667,0.017961,inf
259,"(PLASTERS IN TIN CIRCUS PARADE, RED TOADSTOOL ...",(POSTAGE),0.05102,0.765306,0.05102,1.0,1.306667,0.011974,inf
271,"(PLASTERS IN TIN WOODLAND ANIMALS, RED TOADSTO...",(POSTAGE),0.053571,0.765306,0.053571,1.0,1.306667,0.012573,inf
301,"(SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO...",(SET/6 RED SPOTTY PAPER PLATES),0.102041,0.127551,0.09949,0.975,7.644,0.086474,34.897959
302,"(SET/6 RED SPOTTY PAPER PLATES, SET/20 RED RET...",(SET/6 RED SPOTTY PAPER CUPS),0.102041,0.137755,0.09949,0.975,7.077778,0.085433,34.489796


In [25]:
rules = rules[['support','confidence','lift']]
rules.head()

Unnamed: 0,support,confidence,lift
45,0.076531,1.0,1.306667
259,0.05102,1.0,1.306667
271,0.053571,1.0,1.306667
301,0.09949,0.975,7.644
302,0.09949,0.975,7.077778
