In [1]:
import numpy as np  
import pandas as pd  
from mlxtend.frequent_patterns import apriori, association_rules  

In [2]:
# Now, we will load the Data  
data1 = pd.read_excel('Online Retail.xlsx')  
data1.head()

Unnamed: 0,InvoiceNo,StockCode,lower,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,white hanging heart t-light holder,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,white metal lantern,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,cream cupid hearts coat hanger,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,knitted union flag hot water bottle,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,red woolly hottie white heart.,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [3]:
# here, we will explore the columns of the data  
data1.columns

Index(['InvoiceNo', 'StockCode', 'lower', 'Description', 'Quantity',
       'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [4]:
# Now, we will explore the different regions of transactions  
data1.Country.unique()  

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

In [5]:
# here, we will strip the extra spaces in the description  
data1['Description'] = data1['Description'].str.strip()  
  
# Now, drop the rows which does not have any invoice number  
data1.dropna(axis = 0, subset = ['InvoiceNo'], inplace = True)  
data1['InvoiceNo'] = data1['InvoiceNo'].astype('str')  
  
# Now, we will drop all transactions which were done on credit  
data1 = data1[~data1['InvoiceNo'].str.contains('C')] 
 
print(data1)

       InvoiceNo StockCode                                lower  \
0         536365    85123A   white hanging heart t-light holder   
1         536365     71053                  white metal lantern   
2         536365    84406B       cream cupid hearts coat hanger   
3         536365    84029G  knitted union flag hot water bottle   
4         536365    84029E       red woolly hottie white heart.   
...          ...       ...                                  ...   
541904    581587     22613                                  NaN   
541905    581587     22899                                  NaN   
541906    581587     23254                                  NaN   
541907    581587     23255                                  NaN   
541908    581587     22138                                  NaN   

                                Description  Quantity         InvoiceDate  \
0        WHITE HANGING HEART T-LIGHT HOLDER         6 2010-12-01 08:26:00   
1                       WHITE METAL LANTE

In [6]:
# Transactions done in France  
basket1_France = (data1[data1['Country'] == "France"]  
        .groupby(['InvoiceNo', 'Description'])['Quantity']  
        .sum().unstack().reset_index().fillna(0)  
        .set_index('InvoiceNo'))  
  
# Transactions done in the United Kingdom  
basket1_UK = (data1[data1['Country'] == "United Kingdom"]  
        .groupby(['InvoiceNo', 'Description'])['Quantity']  
        .sum().unstack().reset_index().fillna(0)  
        .set_index('InvoiceNo'))  
  
# Transactions done in Portugal  
basket1_Por = (data1[data1['Country'] == "Portugal"]  
        .groupby(['InvoiceNo', 'Description'])['Quantity']  
        .sum().unstack().reset_index().fillna(0)  
        .set_index('InvoiceNo'))  
  
basket1_Sweden = (data1[data1['Country'] == "Sweden"]  
        .groupby(['InvoiceNo', 'Description'])['Quantity']  
        .sum().unstack().reset_index().fillna(0)  
        .set_index('InvoiceNo'))  

In [7]:
# Here, we will define the hot encoding function   
# for making the data suitable  
# for the concerned libraries  
def hot_encode1(P):  
    if(P<= 0):  
        return 0  
    if(P>= 1):  
        return 1  
  
# Here, we will encode the datasets  
basket1_encoded = basket1_France.applymap(hot_encode1)  
basket1_France = basket1_encoded  
  
basket1_encoded = basket1_UK.applymap(hot_encode1)  
basket1_UK = basket1_encoded  
  
basket1_encoded = basket1_Por.applymap(hot_encode1)  
basket1_Por = basket1_encoded  
  
basket1_encoded = basket1_Sweden.applymap(hot_encode1)  
basket1_Sweden = basket1_encoded  

# # Frequent Items

In [41]:
apriori(basket1_France,min_support=0.05)



Unnamed: 0,support,itemsets
0,0.071429,(35)
1,0.096939,(61)
2,0.102041,(64)
3,0.094388,(65)
4,0.068878,(82)
...,...,...
190,0.102041,"(1267, 1268, 974)"
191,0.099490,"(1248, 1267, 1268)"
192,0.056122,"(64, 65, 61, 974)"
193,0.053571,"(953, 956, 974, 951)"


## Model Training

In [38]:
# Building the model
frq_items = apriori(basket1_France, min_support = 0.05, use_colnames = True)



## Association rules

In [40]:
# Collecting the inferred rules in a dataframe
rules = association_rules(frq_items, metric ="lift", min_threshold = 1)

# Convert the confidence column to a float type
rules['confidence_values'] = rules['confidence'].astype(float)

rules.sort_values('confidence', ascending = False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,confidence_values
270,"(RED TOADSTOOL LED NIGHT LIGHT, PLASTERS IN TI...",(POSTAGE),0.053571,0.765306,0.053571,1.000000,1.306667,0.012573,inf,1.000000
258,"(RED TOADSTOOL LED NIGHT LIGHT, PLASTERS IN TI...",(POSTAGE),0.051020,0.765306,0.051020,1.000000,1.306667,0.011974,inf,1.000000
44,(JUMBO BAG WOODLAND ANIMALS),(POSTAGE),0.076531,0.765306,0.076531,1.000000,1.306667,0.017961,inf,1.000000
301,"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",(SET/6 RED SPOTTY PAPER PLATES),0.102041,0.127551,0.099490,0.975000,7.644000,0.086474,34.897959,0.975000
300,"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",(SET/6 RED SPOTTY PAPER CUPS),0.102041,0.137755,0.099490,0.975000,7.077778,0.085433,34.489796,0.975000
...,...,...,...,...,...,...,...,...,...,...
96,(POSTAGE),(PARTY BUNTING),0.765306,0.056122,0.051020,0.066667,1.187879,0.008070,1.011297,0.066667
225,(POSTAGE),"(LUNCH BAG RED RETROSPOT, LUNCH BAG WOODLAND)",0.765306,0.056122,0.051020,0.066667,1.187879,0.008070,1.011297,0.066667
263,(POSTAGE),"(RED TOADSTOOL LED NIGHT LIGHT, PLASTERS IN TI...",0.765306,0.051020,0.051020,0.066667,1.306667,0.011974,1.016764,0.066667
36,(POSTAGE),(JAM MAKING SET PRINTED),0.765306,0.053571,0.051020,0.066667,1.244444,0.010022,1.014031,0.066667


## Association rules which have highest confidence

In [34]:
# Filter the rules by confidence
high_confidence_rules = rules[rules['confidence_values'] > 0.8]

# Print the high confidence rules
for index, row in high_confidence_rules.iterrows():
    print('Rule:', row['antecedents'], '->', row['consequents'])
    print('Confidence:', row['confidence_values'])

Rule: frozenset({'ALARM CLOCK BAKELIKE GREEN'}) -> frozenset({'ALARM CLOCK BAKELIKE RED'})
Confidence: 0.8157894736842106
Rule: frozenset({'ALARM CLOCK BAKELIKE RED'}) -> frozenset({'ALARM CLOCK BAKELIKE GREEN'})
Confidence: 0.8378378378378379
Rule: frozenset({'ALARM CLOCK BAKELIKE GREEN'}) -> frozenset({'POSTAGE'})
Confidence: 0.868421052631579
Rule: frozenset({'ALARM CLOCK BAKELIKE PINK'}) -> frozenset({'POSTAGE'})
Confidence: 0.875
Rule: frozenset({'ALARM CLOCK BAKELIKE RED'}) -> frozenset({'POSTAGE'})
Confidence: 0.918918918918919
Rule: frozenset({'ASSORTED COLOUR MINI CASES'}) -> frozenset({'POSTAGE'})
Confidence: 0.925925925925926
Rule: frozenset({'CHARLOTTE BAG DOLLY GIRL DESIGN'}) -> frozenset({'POSTAGE'})
Confidence: 0.8846153846153846
Rule: frozenset({'CHILDRENS APRON SPACEBOY DESIGN'}) -> frozenset({'POSTAGE'})
Confidence: 0.8076923076923076
Rule: frozenset({'CHILDRENS CUTLERY DOLLY GIRL'}) -> frozenset({'CHILDRENS CUTLERY SPACEBOY'})
Confidence: 0.8928571428571429
Rule: fro

In [35]:
high_confidence_rules.sort_values('confidence', ascending = False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,confidence_values
270,"(RED TOADSTOOL LED NIGHT LIGHT, PLASTERS IN TI...",(POSTAGE),0.053571,0.765306,0.053571,1.000000,1.306667,0.012573,inf,1.000000
258,"(RED TOADSTOOL LED NIGHT LIGHT, PLASTERS IN TI...",(POSTAGE),0.051020,0.765306,0.051020,1.000000,1.306667,0.011974,inf,1.000000
44,(JUMBO BAG WOODLAND ANIMALS),(POSTAGE),0.076531,0.765306,0.076531,1.000000,1.306667,0.017961,inf,1.000000
300,"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",(SET/6 RED SPOTTY PAPER CUPS),0.102041,0.137755,0.099490,0.975000,7.077778,0.085433,34.489796,0.975000
301,"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",(SET/6 RED SPOTTY PAPER PLATES),0.102041,0.127551,0.099490,0.975000,7.644000,0.086474,34.897959,0.975000
...,...,...,...,...,...,...,...,...,...,...
23,(CHILDRENS APRON SPACEBOY DESIGN),(POSTAGE),0.066327,0.765306,0.053571,0.807692,1.055385,0.002811,1.220408,0.807692
178,"(ALARM CLOCK BAKELIKE GREEN, ALARM CLOCK BAKEL...",(ALARM CLOCK BAKELIKE PINK),0.079082,0.102041,0.063776,0.806452,7.903226,0.055706,4.639456,0.806452
118,(PLASTERS IN TIN WOODLAND ANIMALS),(POSTAGE),0.170918,0.765306,0.137755,0.805970,1.053134,0.006950,1.209576,0.805970
265,"(PLASTERS IN TIN WOODLAND ANIMALS, PLASTERS IN...",(POSTAGE),0.104592,0.765306,0.084184,0.804878,1.051707,0.004139,1.202806,0.804878
