ARL : Association Rule Learning

Association Rule Learning is a rule-based machine learning technique used to find patterns in data; in this notebook the rules are mined with the Apriori algorithm. Apriori enumerates candidate product combinations subject to a support threshold fixed at the beginning of the process and, in each iteration, eliminates the candidates whose support falls below that threshold to produce the final table.

In this project, after data preprocessing, the Invoice-Item matrix required by the Apriori algorithm is built. Items that frequently occur together are then found with the Apriori algorithm, and a rule table is derived using the association rules method. Once the rule table has been created, it is sorted as needed and the recommendation step is performed.

In [2]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Display settings: show every column, allow up to 500 characters per line and
# keep wide frames from wrapping onto several lines.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 500)
pd.set_option("display.expand_frame_repr", False)

In [4]:
# Read both yearly sheets of the workbook in one call. The result is indexed by
# sheet name further down, i.e. one DataFrame per requested sheet.
df_original = pd.read_excel("online_retail_II.xlsx",
                            sheet_name = ["Year 2009-2010", "Year 2010-2011"])

In [5]:
# Stack the two yearly sheets into a single frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent. Row labels are kept as they are (no
# ignore_index), which matches what append did, so labels repeat across sheets.
df1 = df_original["Year 2009-2010"]
df2 = df_original["Year 2010-2011"]
df_ = pd.concat([df1, df2])

In [6]:
# Work on a copy so the concatenated frame df_ itself is left untouched.
df = df_.copy()

df.shape

(1067371, 8)

In [7]:
# Summary statistics of the numeric columns before any cleaning.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Quantity,1067371.0,9.938898,172.705794,-80995.0,1.0,3.0,10.0,80995.0
Price,1067371.0,4.649388,123.553059,-53594.36,1.25,2.1,4.15,38970.0
Customer ID,824364.0,15324.638504,1697.46445,12346.0,13975.0,15255.0,16797.0,18287.0


In [8]:
def retail_data_prep(dataframe):
    """Return a cleaned copy of the retail transactions.

    The following rows are removed:
      * rows with a missing value in any column,
      * rows whose ``Invoice`` contains the letter "C",
      * rows whose ``Quantity`` or ``Price`` is not strictly positive.

    The input frame is not modified; a new DataFrame is returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns ``Invoice``, ``Quantity`` and ``Price``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``dataframe`` that pass every filter, with their original
        row labels.
    """
    # Drop missing values without inplace=True so the caller's frame is left
    # intact and re-running the cell is safe.
    complete_rows = dataframe.dropna()

    # Invoice may hold a mix of integers and strings; casting to str keeps the
    # .str accessor usable even when the column is purely numeric.
    has_c = complete_rows["Invoice"].astype(str).str.contains("C", regex=False)

    keep = ~has_c & (complete_rows["Quantity"] > 0) & (complete_rows["Price"] > 0)
    return complete_rows[keep]

In [9]:
# Apply the cleaning rules; df now refers to the filtered frame.
df = retail_data_prep(df)

In [10]:
# Summary statistics again, this time on the cleaned frame.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Quantity,805549.0,13.290522,143.634088,1.0,2.0,5.0,12.0,80995.0
Price,805549.0,3.206561,29.199173,0.001,1.25,1.95,3.75,10953.5
Customer ID,805549.0,15331.95497,1696.737039,12346.0,13982.0,15271.0,16805.0,18287.0


In [11]:
# Count the missing values per column in the cleaned frame.
df.isnull().sum()

Invoice        0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
Price          0
Customer ID    0
Country        0
dtype: int64

In [12]:
def outlier_threshold(dataframe, varibale):
    """Compute lower and upper outlier limits for one column.

    The limits follow the usual "1.5 x range" rule, but the range is taken
    between the 1st and the 99th percentile instead of the classic quartiles,
    so that only extreme values fall outside the limits and the dataset is not
    changed harshly.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    varibale : str
        Name of the column to analyse.

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.
    """
    column = dataframe[varibale]
    lower_q = column.quantile(0.01)
    upper_q = column.quantile(0.99)
    margin = 1.5 * (upper_q - lower_q)
    return lower_q - margin, upper_q + margin

In [13]:
def replace_with_thresholds(dataframe, variable):
    """Cap one column of ``dataframe`` in place at its outlier limits.

    The limits come from ``outlier_threshold``. Values below the lower limit
    are set to the lower limit, then values above the upper limit are set to
    the upper limit. Nothing is returned; ``dataframe`` itself is modified.
    """
    lower_bound, upper_bound = outlier_threshold(dataframe, variable)

    too_small = dataframe[variable] < lower_bound
    dataframe.loc[too_small, variable] = lower_bound

    too_large = dataframe[variable] > upper_bound
    dataframe.loc[too_large, variable] = upper_bound

In [14]:
# Cap extreme Quantity and Price values; both calls modify df in place.
replace_with_thresholds(df, "Quantity")
replace_with_thresholds(df, "Price")

In [15]:
# Restrict the analysis to the rows whose Country is France.
df_fr = df[df["Country"] == "France"]
df_fr.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
71,489439,22065,CHRISTMAS PUDDING TRINKET POT,12.0,2009-12-01 09:28:00,1.45,12682.0,France
72,489439,22138,BAKING SET 9 PIECE RETROSPOT,9.0,2009-12-01 09:28:00,4.95,12682.0,France
73,489439,22139,RETRO SPOT TEA SET CERAMIC 11 PC,9.0,2009-12-01 09:28:00,4.95,12682.0,France
74,489439,22352,LUNCHBOX WITH CUTLERY RETROSPOT,12.0,2009-12-01 09:28:00,2.55,12682.0,France
75,489439,85014A,BLACK/BLUE DOTS RUFFLED UMBRELLA,3.0,2009-12-01 09:28:00,5.95,12682.0,France


In [16]:
# Size of the France subset (rows, columns).
df_fr.shape

(13812, 8)

In [17]:
# Preview of the invoice x product basket matrix: 1 when the invoice contains
# the stock code with a positive total quantity, 0 otherwise.
# The comparison is vectorised instead of using applymap with a Python lambda
# (applymap is deprecated since pandas 2.1). Missing invoice/product pairs are
# NaN after unstack, and NaN > 0 evaluates to False, so they become 0.
df_fr.groupby(["Invoice", "StockCode"]).agg({"Quantity": "sum"}). \
    unstack(). \
    gt(0). \
    astype(int).iloc[0:8, 0:8]

Unnamed: 0_level_0,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity
StockCode,10002,10120,10125,10135,11001,15036,15039,16012
Invoice,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
489439,0,0,0,0,0,0,0,0
489557,0,0,0,0,0,0,0,0
489883,0,0,0,0,0,0,0,0
490139,0,0,0,0,0,0,0,0
490152,0,0,0,0,0,0,0,0
490458,1,0,0,0,0,0,0,0
490684,0,0,0,0,0,0,0,0
490959,1,0,0,0,0,0,0,0


In [18]:
def create_invoice_product_df(dataframe, id=False):
    """Build the binary invoice x product matrix.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Transactions with the columns ``Invoice``, ``Quantity`` and either
        ``StockCode`` or ``Description`` (depending on ``id``).
    id : bool, default False
        When truthy, products are identified by ``StockCode``; otherwise by
        ``Description``.

    Returns
    -------
    pandas.DataFrame
        One row per invoice and one column per product. A cell is 1 when the
        summed quantity of that product on that invoice is positive, else 0.
    """
    product_column = "StockCode" if id else "Description"
    quantities = (
        dataframe.groupby(["Invoice", product_column])["Quantity"]
        .sum()
        .unstack()
    )
    # Vectorised replacement for fillna(0).applymap(lambda x: 1 if x > 0 else 0):
    # pairs that never occur are NaN after unstack and NaN > 0 is False, so
    # they end up as 0 as well. applymap is deprecated since pandas 2.1.
    return quantities.gt(0).astype(int)

In [19]:
# Build the basket matrix with stock codes (id=True) as column labels.
# Author's rationale: using the product names as column labels takes up a lot
# of memory and makes the code run slowly, so stock codes are preferred.
fr_inv_pro_df = create_invoice_product_df(df_fr, id=True)

In [20]:
# Show the top-left 8 x 8 corner of the basket matrix.
fr_inv_pro_df.iloc[0:8, 0:8]

StockCode,10002,10120,10125,10135,11001,15036,15039,16012
Invoice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
489439,0,0,0,0,0,0,0,0
489557,0,0,0,0,0,0,0,0
489883,0,0,0,0,0,0,0,0
490139,0,0,0,0,0,0,0,0
490152,0,0,0,0,0,0,0,0
490458,1,0,0,0,0,0,0,0
490684,0,0,0,0,0,0,0,0
490959,1,0,0,0,0,0,0,0


In [21]:
def check_id(dataframe, stock_code):
    """Print the description of the first row matching ``stock_code``.

    The description is printed as a one-element list. An IndexError is raised
    when no row of ``dataframe`` has that stock code.
    """
    matches = dataframe.loc[dataframe["StockCode"] == stock_code, ["Description"]]
    first_match = matches.values[0]
    print(first_match.tolist())

# Spot-check: print the description stored for stock code 10002.
check_id(df_fr, 10002)

['INFLATABLE POLITICAL GLOBE ']


In [22]:
# Mine the itemsets that appear in at least 1% of the rows (invoices) of the
# basket matrix. The 0/1 matrix is cast to bool at the call site: mlxtend
# recommends boolean input and emits a deprecation warning for other dtypes
# (hidden in this notebook by the global warnings filter). The cast does not
# change which itemsets are found.
frequent_itemsets = apriori(fr_inv_pro_df.astype(bool),
                            min_support=0.01,
                            use_colnames=True)

frequent_itemsets.sort_values("support", ascending=False)

Unnamed: 0,support,itemsets
489,0.758958,(POST)
123,0.210098,(21731)
217,0.188925,(22352)
1953,0.187296,"(21731, POST)"
209,0.177524,(22326)
...,...,...
1465,0.011401,"(21137, 22181)"
8379,0.011401,"(22554, 21559, 20726, 22551)"
8380,0.011401,"(22556, 21559, 20726, 22551)"
8381,0.011401,"(POST, 21559, 20726, 22551)"


In [23]:
# Derive association rules from the frequent itemsets. The rules are filtered
# on support with a 0.01 threshold, the same value passed to apriori as
# min_support, so presumably little or nothing is discarded at this step
# (TODO confirm); the selective filtering is applied when the table is queried.
rules = association_rules(frequent_itemsets,
                          metric="support",
                          min_threshold=0.01)


In [24]:
# Keep only the rules with support > 0.05, confidence > 0.1 and lift > 5,
# listed from the highest confidence to the lowest.
rules[(rules["support"] > 0.05)
      & (rules["confidence"] > 0.1)
      & (rules["lift"] > 5)].sort_values("confidence", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
47334,"(21080, POST, 21086)",(21094),0.078176,0.127036,0.074919,0.958333,7.543803,0.064987,20.951140,0.941005
14591,"(21080, 21094)",(21086),0.096091,0.138436,0.091205,0.949153,6.856231,0.077903,16.944083,0.944949
14590,"(21080, 21086)",(21094),0.096091,0.127036,0.091205,0.949153,7.471534,0.078998,17.168295,0.958237
47336,"(21080, POST, 21094)",(21086),0.079805,0.138436,0.074919,0.938776,6.781273,0.063871,14.072204,0.926472
1585,(21094),(21086),0.127036,0.138436,0.115635,0.910256,6.575264,0.098049,9.600279,0.971305
...,...,...,...,...,...,...,...,...,...,...
5092,(22629),(22631),0.130293,0.087948,0.060261,0.462500,5.258796,0.048802,1.696841,0.931167
576,(20724),(22356),0.136808,0.084691,0.058632,0.428571,5.060440,0.047046,1.601792,0.929560
30422,(22629),"(POST, 22630)",0.130293,0.073290,0.055375,0.425000,5.798889,0.045825,1.611670,0.951531
30470,(22629),"(POST, 22631)",0.130293,0.071661,0.050489,0.387500,5.407386,0.041152,1.515655,0.937175


Product Recommendation

In [25]:
def arl_recommender(rules_df, product_id, rec_count=1):
    """Recommend products for ``product_id`` from an association-rule table.

    Rules are visited from the highest lift to the lowest. For every rule
    whose antecedents contain ``product_id``, the first item of the rule's
    consequents is taken as a candidate. Each candidate is recommended only
    once, so the result never repeats a product.

    Parameters
    ----------
    rules_df : pandas.DataFrame
        Rule table with the columns ``antecedents``, ``consequents`` (item
        collections) and ``lift``.
    product_id : hashable
        Identifier of the product to find recommendations for.
    rec_count : int, default 1
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Up to ``rec_count`` distinct product identifiers, best lift first.
        Empty when no rule has ``product_id`` among its antecedents.
    """
    sorted_rules = rules_df.sort_values("lift", ascending=False)

    recommendation_list = []
    already_recommended = set()
    # Walk both columns together rather than looking rows up with .iloc[i] for
    # every match, which builds a whole row object each time.
    for antecedents, consequents in zip(sorted_rules["antecedents"],
                                        sorted_rules["consequents"]):
        if product_id not in antecedents:
            continue
        candidate = list(consequents)[0]
        # Several rules can point at the same consequent; keep only the first
        # (highest-lift) occurrence so the caller gets distinct products.
        if candidate not in already_recommended:
            already_recommended.add(candidate)
            recommendation_list.append(candidate)

    return recommendation_list[0:rec_count]

In [26]:
# Ask for the two best recommendations for stock code 22492.
arl_recommender(rules, 22492, 2)

[21914, 21080]

In [27]:
# Look up the product names of the stock codes that are going to be recommended

def check_id(dataframe, stock_code):
    """Print the description of one or several stock codes.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns ``StockCode`` and ``Description``.
    stock_code : scalar or iterable
        Either a single stock code (for example ``10002`` or ``"POST"``) or an
        iterable of stock codes. Strings are treated as a single code rather
        than being iterated character by character.

    For every code one line ``<code> : [<description>]`` is printed, using the
    first matching row. An IndexError is raised for a code that is not present
    in ``dataframe``.
    """
    # This definition replaces an earlier single-code check_id in the notebook,
    # so a lone code is still accepted to keep calls such as
    # check_id(df, 10002) working after this cell has been run.
    if isinstance(stock_code, str) or not hasattr(stock_code, "__iter__"):
        stock_code = [stock_code]

    for i in stock_code:
        product_name = dataframe[dataframe["StockCode"] == i][["Description"]].values[0].tolist()
        print(f"{i} : {product_name}")


# Print the names of the two products recommended for stock code 22492.
check_id(df_fr, arl_recommender(rules, 22492, 2))

21914 : ['BLUE HARMONICA IN BOX ']
21080 : ['SET/20 RED SPOTTY PAPER NAPKINS ']
