In [3]:
import pandas as pd
import random as rnd
import re
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import os
%matplotlib inline
sns.set(color_codes=True)

In [4]:
# Working directory that holds data.csv. The original machine-specific path is
# kept as the default so existing runs behave the same; set the BASKET_DATA_DIR
# environment variable to run the notebook from another location.
DATA_DIR = os.environ.get('BASKET_DATA_DIR', 'F:/Carpeta Drive/Python/Base')
os.chdir(DATA_DIR)

In [5]:
# Load the raw transaction export from the working directory, decoding it
# with the ISO-8859-1 codec.
df = pd.read_csv(
    'data.csv',
    encoding="ISO-8859-1",
)

In [6]:
# Preview the first five rows to check that the columns were parsed as expected.
df.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [7]:
# Tidy the transactions in one pass:
#   1. trim surrounding whitespace from the product descriptions,
#   2. discard rows that have no invoice number,
#   3. store invoice numbers as strings,
#   4. drop every invoice whose number contains 'C'
#      (presumably credit notes / cancellations -- confirm against the data source).
df = (
    df.assign(Description=df['Description'].str.strip())
      .dropna(axis=0, subset=['InvoiceNo'])
      .assign(InvoiceNo=lambda frame: frame['InvoiceNo'].astype('str'))
      .loc[lambda frame: ~frame['InvoiceNo'].str.contains('C')]
)

In [8]:
# Build the invoice x product matrix for France: one row per invoice, one
# column per product description, each cell holding the summed quantity
# (0 where the product does not appear on the invoice).
france_quantities = (
    df[df['Country'] == "France"]
    .groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()
)
basket = (
    france_quantities
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

In [9]:
# Column dtypes and non-null counts for the French transactions only.
df.loc[df['Country'] == "France"].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8408 entries, 26 to 541908
Data columns (total 8 columns):
InvoiceNo      8408 non-null object
StockCode      8408 non-null object
Description    8408 non-null object
Quantity       8408 non-null int64
InvoiceDate    8408 non-null object
UnitPrice      8408 non-null float64
CustomerID     8342 non-null float64
Country        8408 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 591.2+ KB


In [10]:
# Number of distinct French invoices, i.e. the number of rows in `basket`.
df.loc[df['Country'] == "France", 'InvoiceNo'].nunique()

392

In [11]:
def encode_units(x):
    """Convert a summed quantity into a 0/1 purchase flag.

    Parameters
    ----------
    x : number
        Total quantity of one product on one invoice.

    Returns
    -------
    int
        1 when at least one unit was bought (``x >= 1``), otherwise 0.

    Notes
    -----
    The previous two-branch version fell through and returned ``None`` for
    NaN and for values strictly between 0 and 1, which would leave ``None``
    in the matrix handed to ``apriori``. Every input now maps to 0 or 1.
    """
    # The comparison is False for NaN too, so one test covers zero, negative,
    # fractional and missing quantities alike.
    return 1 if x >= 1 else 0

# Binarise every cell with encode_units, then remove the 'POSTAGE' column so
# it does not take part in the itemset mining.
basket_sets = (
    basket
    .applymap(encode_units)
    .drop('POSTAGE', axis=1)
)

In [12]:
# Mine itemsets present in at least 7% of the French invoices; use_colnames
# makes the itemsets carry product descriptions rather than column positions.
frequent_itemsets = apriori(
    basket_sets,
    min_support=0.07,
    use_colnames=True,
)

In [13]:
# Derive association rules from the frequent itemsets, keeping those whose
# lift is at least 1, then preview the first five.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
)
rules.head(n=5)

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",(SET/6 RED SPOTTY PAPER PLATES),0.102041,0.127551,0.09949,0.975,7.644,0.086474,34.897959
1,"(SET/6 RED SPOTTY PAPER CUPS, SET/6 RED SPOTTY...",(SET/20 RED RETROSPOT PAPER NAPKINS),0.122449,0.132653,0.09949,0.8125,6.125,0.083247,4.62585
2,"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",(SET/6 RED SPOTTY PAPER CUPS),0.102041,0.137755,0.09949,0.975,7.077778,0.085433,34.489796
3,(SET/6 RED SPOTTY PAPER CUPS),"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",0.137755,0.102041,0.09949,0.722222,7.077778,0.085433,3.232653
4,(SET/20 RED RETROSPOT PAPER NAPKINS),"(SET/6 RED SPOTTY PAPER CUPS, SET/6 RED SPOTTY...",0.132653,0.122449,0.09949,0.75,6.125,0.083247,3.510204


In [34]:
# Presentation copy of the rules. Positions 0, 1, 4, 5, 6 of `rules` are the
# antecedents, consequents, support, confidence and lift columns.
df_rules = rules.iloc[:, [0, 1, 4, 5, 6]].copy()
df_rules.columns = ["From", "To", "Support", "Confidence", "Lift"]

# Show support and confidence as percentage strings with two decimals.
# After this step both columns hold text, not numbers.
for pct_col in ("Support", "Confidence"):
    df_rules[pct_col] = (df_rules[pct_col] * 100).map('{:,.2f}%'.format)

In [35]:
# Rank on the numeric columns of `rules`, not on df_rules' formatted text:
# "Support" and "Confidence" are strings such as '9.95%' by now, so sorting
# them compares character by character ('9.95%' > '12.24%', and '100.00%'
# would land below '97.50%'). df_rules keeps the row index of `rules`, so the
# numerically sorted index can be used to reorder the presentation table.
ranked_index = rules.sort_values(
    by=['confidence', 'support', 'lift'], ascending=False
).index
df_rules.loc[ranked_index].head(10)

Unnamed: 0,From,To,Support,Confidence,Lift
0,"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",(SET/6 RED SPOTTY PAPER PLATES),9.95%,97.50%,7.644
2,"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",(SET/6 RED SPOTTY PAPER CUPS),9.95%,97.50%,7.077778
25,(SET/6 RED SPOTTY PAPER PLATES),(SET/6 RED SPOTTY PAPER CUPS),12.24%,96.00%,6.968889
24,(SET/6 RED SPOTTY PAPER CUPS),(SET/6 RED SPOTTY PAPER PLATES),12.24%,88.89%,6.968889
13,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE GREEN),7.91%,83.78%,8.642959
12,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED),7.91%,81.58%,8.642959
1,"(SET/6 RED SPOTTY PAPER CUPS, SET/6 RED SPOTTY...",(SET/20 RED RETROSPOT PAPER NAPKINS),9.95%,81.25%,6.125
17,(SET/6 RED SPOTTY PAPER PLATES),(SET/20 RED RETROSPOT PAPER NAPKINS),10.20%,80.00%,6.030769
21,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE PINK),7.40%,78.38%,7.681081
5,(SET/6 RED SPOTTY PAPER PLATES),"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",9.95%,78.00%,7.644
