# Frequente item sets
Frequent Itemsets met A-Priori-algoritme

In [73]:
import pyodbc
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Database connectie

In [74]:
# database name
DB = {
    'servername': '(local)\\SQLEXPRESS',
    'database': 'DEDS_DataWarehouse'}

In [75]:
export_conn = pyodbc.connect('DRIVER={SQL Server};SERVER=' + DB['servername'] + ';DATABASE=' + DB['database'] + ';Trusted_Connection=yes')


# Create a cursor from the connection
export_cursor = export_conn.cursor()

# check if connection is successful, else throw an error
if export_conn:
    print("Connection with database is established")
else:
    print("Connection with database is not established")
    raise Exception("Connection with database is not established")

Connection with database is established


## Tabellen inlezen
Great Outdoors wil graag weten welke producten vaak samen gekocht worden door klanten.
Hiervoor hebben we informatie uit de volgende tabellen nodig:
- product
- order_details

In [76]:
product_query = "SELECT * FROM Product"

product_result = export_cursor.execute(product_query)
product_fetch = product_result.fetchall()
product_columns = [column[0] for column in product_result.description]
product = pd.DataFrame.from_records(product_fetch, columns=product_columns)

# Dropping current_value and last_updated columns
product = product.drop(columns=['CURRENT_VALUE', 'LAST_UPDATED'])

product

Unnamed: 0,PRODUCT_SK,PRODUCT_number,PRODUCT_name_product,PRODUCT_description_description,PRODUCT_image_image,PRODUCT_INTRODUCTION_DATE_introduced,PRODUCT_PRODUCTION_COST_cost,PRODUCT_MARGIN_margin,PRODUCT_LANGUAGE_language,PRODUCT_MINIMUM_SALE_PRICE_minPrice,PRODUCT_PRODUCT_LINE_code,PRODUCT_PRODUCT_LINE_code_en,PRODUCT_PRODUCT_TYPE_code,PRODUCT_PRODUCT_TYPE_code_en
0,1,1,TrailChef Water Bag,"Lightweight, collapsible bag to carry liquids ...",P01CE1CG1.jpg,2011-02-15,4.0000,0.3300,EN,4.3300,1,Camping Equipment,1,Cooking Gear
1,2,10,TrailChef Utensils,"Spoon, fork and knife set made of a light yet ...",P10CE1CG1.jpg,2011-02-15,10.0000,0.4000,EN,10.4000,1,Camping Equipment,1,Cooking Gear
2,3,100,Insect Bite Relief,The Insect Bite Relief helps the itching and s...,P100OP4FA17.jpg,2011-02-15,3.0000,0.5000,EN,3.5000,4,Outdoor Protection,17,First Aid
3,4,101,Hailstorm Steel Irons,Iron is 17-4 stainless steel. Shafts are grap...,P101GE5IR18.jpg,2019-12-15,305.5400,0.4300,EN,305.9700,5,Golf Equipment,18,Irons
4,5,102,Hailstorm Titanium Irons,Made entirely of pure titanium. The ultimate i...,P102GE5IR18.jpg,2019-10-12,380.9500,0.5100,EN,381.4600,5,Golf Equipment,18,Irons
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,111,95,Sun Shield,"PABA free sunscreen, SPF 30, poison oak and iv...",P91OP4SS16.jpg,2011-02-15,3.0000,0.5000,EN,3.5000,4,Outdoor Protection,16,Sunscreen
111,112,96,Compact Relief Kit,A personal first aid kit is recommended for ev...,P96OP4FA17.jpg,2011-02-15,16.4300,0.2800,EN,16.7100,4,Outdoor Protection,17,First Aid
112,113,97,Deluxe Family Relief Kit,A complete medical kit suitable for families w...,P96OP4FA17.jpg,2013-05-03,25.0000,0.2800,EN,25.2800,4,Outdoor Protection,17,First Aid
113,114,98,Calamine Relief,Use the Calamine Relief for allergic skin reac...,P98OP4FA17.jpg,2011-02-15,3.0000,0.5000,EN,3.5000,4,Outdoor Protection,17,First Aid


In [77]:
order_details_query = "SELECT * FROM Order_details"

order_details_result = export_cursor.execute(order_details_query)
order_details_fetch = order_details_result.fetchall()
order_details_columns = [column[0] for column in order_details_result.description]
order_details = pd.DataFrame.from_records(order_details_fetch, columns=order_details_columns)

# Dropping current_value and last_updated columns
order_details = order_details.drop(columns=['CURRENT_VALUE', 'LAST_UPDATED'])

order_details

Unnamed: 0,ORDER_DETAILS_SK,ORDER_DETAILS_code,ORDER_DETAILS_QUANTITY_quantity,ORDER_DETAILS_TOTAL_COST_total,ORDER_DETAILS_TOTAL_MARGIN_margin,ORDER_DETAILS_ORDER_NUMBER_order,ORDER_DETAILS_PRODUCT_NUMBER_product,ORDER_DETAILS_UNIT_ID_unit
0,1,100000,16,257.6000,360.6400,4405,112,1
1,2,100001,20,322.0000,450.8000,5008,112,2
2,3,100002,24,386.4000,540.9600,4394,112,3
3,4,100003,18,289.8000,405.7200,4396,112,4
4,5,100004,20,322.0000,450.8000,4382,112,5
...,...,...,...,...,...,...,...,...
43058,43059,99995,146,402.9600,810.3000,4402,111,43059
43059,43060,99996,172,474.7200,954.6000,4400,111,43060
43060,43061,99997,192,529.9200,1065.6000,5148,111,43061
43061,43062,99998,192,529.9200,1065.6000,4384,111,43062


## Database connectie sluiten

In [78]:
export_cursor.close()
export_conn.close()

## Data samenvoegen

In [79]:
combined_data = pd.merge(order_details, product, left_on='ORDER_DETAILS_PRODUCT_NUMBER_product', right_on='PRODUCT_SK')

# dropping all sk columns
sk_columns = combined_data.filter(like='SK').columns
combined_data.drop(columns=sk_columns, inplace=True)

combined_data

Unnamed: 0,ORDER_DETAILS_code,ORDER_DETAILS_QUANTITY_quantity,ORDER_DETAILS_TOTAL_COST_total,ORDER_DETAILS_TOTAL_MARGIN_margin,ORDER_DETAILS_ORDER_NUMBER_order,ORDER_DETAILS_PRODUCT_NUMBER_product,ORDER_DETAILS_UNIT_ID_unit,PRODUCT_number,PRODUCT_name_product,PRODUCT_description_description,PRODUCT_image_image,PRODUCT_INTRODUCTION_DATE_introduced,PRODUCT_PRODUCTION_COST_cost,PRODUCT_MARGIN_margin,PRODUCT_LANGUAGE_language,PRODUCT_MINIMUM_SALE_PRICE_minPrice,PRODUCT_PRODUCT_LINE_code,PRODUCT_PRODUCT_LINE_code_en,PRODUCT_PRODUCT_TYPE_code,PRODUCT_PRODUCT_TYPE_code_en
0,100000,16,257.6000,360.6400,4405,112,1,96,Compact Relief Kit,A personal first aid kit is recommended for ev...,P96OP4FA17.jpg,2011-02-15,16.4300,0.2800,EN,16.7100,4,Outdoor Protection,17,First Aid
1,100001,20,322.0000,450.8000,5008,112,2,96,Compact Relief Kit,A personal first aid kit is recommended for ev...,P96OP4FA17.jpg,2011-02-15,16.4300,0.2800,EN,16.7100,4,Outdoor Protection,17,First Aid
2,100002,24,386.4000,540.9600,4394,112,3,96,Compact Relief Kit,A personal first aid kit is recommended for ev...,P96OP4FA17.jpg,2011-02-15,16.4300,0.2800,EN,16.7100,4,Outdoor Protection,17,First Aid
3,100003,18,289.8000,405.7200,4396,112,4,96,Compact Relief Kit,A personal first aid kit is recommended for ev...,P96OP4FA17.jpg,2011-02-15,16.4300,0.2800,EN,16.7100,4,Outdoor Protection,17,First Aid
4,100004,20,322.0000,450.8000,4382,112,5,96,Compact Relief Kit,A personal first aid kit is recommended for ev...,P96OP4FA17.jpg,2011-02-15,16.4300,0.2800,EN,16.7100,4,Outdoor Protection,17,First Aid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43058,99995,146,402.9600,810.3000,4402,111,43059,95,Sun Shield,"PABA free sunscreen, SPF 30, poison oak and iv...",P91OP4SS16.jpg,2011-02-15,3.0000,0.5000,EN,3.5000,4,Outdoor Protection,16,Sunscreen
43059,99996,172,474.7200,954.6000,4400,111,43060,95,Sun Shield,"PABA free sunscreen, SPF 30, poison oak and iv...",P91OP4SS16.jpg,2011-02-15,3.0000,0.5000,EN,3.5000,4,Outdoor Protection,16,Sunscreen
43060,99997,192,529.9200,1065.6000,5148,111,43061,95,Sun Shield,"PABA free sunscreen, SPF 30, poison oak and iv...",P91OP4SS16.jpg,2011-02-15,3.0000,0.5000,EN,3.5000,4,Outdoor Protection,16,Sunscreen
43061,99998,192,529.9200,1065.6000,4384,111,43062,95,Sun Shield,"PABA free sunscreen, SPF 30, poison oak and iv...",P91OP4SS16.jpg,2011-02-15,3.0000,0.5000,EN,3.5000,4,Outdoor Protection,16,Sunscreen


## Data voorbereiden

In [80]:
# dropping all columns except ORDER_DETAILS_ORDER_NUMBER_order and PRODUCT_name_product
combined_data = combined_data[['ORDER_DETAILS_ORDER_NUMBER_order', 'PRODUCT_name_product']]

# Group the data by 'ORDER_DETAILS_ORDER_NUMBER_order'
grouped_data = combined_data.groupby('ORDER_DETAILS_ORDER_NUMBER_order')

# Apply dummy encoding to the 'PRODUCT_name_product' column
dummy_data = pd.get_dummies(combined_data['PRODUCT_name_product'])

# dropping the 'PRODUCT_name_product' column
combined_data = combined_data.drop(columns=['PRODUCT_name_product'])

# Join the dummy encoded DataFrame back to the original DataFrame
combined_data_encoded = pd.concat([combined_data, dummy_data], axis=1)

# Group by 'ORDER_DETAILS_ORDER_NUMBER_order' again and sum the dummy columns. This is needed to combine multiple rows for the same order into one row
final_data = combined_data_encoded.groupby('ORDER_DETAILS_ORDER_NUMBER_order').sum()

# Convert the dummy columns to boolean
final_data = final_data.astype(bool)

final_data

Unnamed: 0_level_0,Aloe Relief,Bear Edge,Bear Survival Edge,Blue Steel Max Putter,Blue Steel Putter,BugShield Extreme,BugShield Lotion,BugShield Lotion Lite,BugShield Natural,BugShield Spray,...,TrailChef Canteen,TrailChef Cook Set,TrailChef Cup,TrailChef Deluxe Cook Set,TrailChef Double Flame,TrailChef Kettle,TrailChef Kitchen Kit,TrailChef Single Flame,TrailChef Utensils,TrailChef Water Bag
ORDER_DETAILS_ORDER_NUMBER_order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,False,False,False,False,False,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
5,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5356,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5357,False,False,False,False,False,True,False,False,False,False,...,False,True,True,False,False,False,False,True,True,False
5358,True,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,True,False,False,False
5359,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,True,False,False,False,True,True


## A-Priori-algoritme
Support is de relatieve frequentie waarmee de regels opduiken. Als een product in 20% van de transacties voorkomt, is de support 0.2.

Confidence is een maat voor de betrouwbaarheid van de regel. Een confidence van 0.5 betekent dat in 50% van de gevallen waarin {A} en {B} werden gekocht, de aankoop ook {C} omvatte.

Lift is de verhouding tussen de waargenomen support en de verwachte support als A en C onafhankelijk zouden zijn. De vuistregel is dat een liftwaarde dicht bij 1 betekent dat de regels volledig onafhankelijk waren.

Rules zijn het resultaat van het algoritme voor associatieregels. Dit zijn de regels die frequent werden gevonden in de gegevens.

Antecedents zijn de items die worden gekocht ({A} -> {B}, A is de antecedent).
Consequents zijn de items die samen met het antecedent worden gekocht ({A} -> {B}, B is de consequent). Consequents zijn dus de voorwaarden die volgen op de antecedenten, de voorspellingen.


In [81]:
# Define the support and confidence values to try
support_values = [0.2, 0.1, 0.05, 0.01]
confidence_values = [0.3, 0.4, 0.5, 0.6, 0.7]

# Create a list to store the DataFrames
rules_list = []

frequent_itemsets_list = []

# Iterate over all combinations of support and confidence values
for support in support_values:
    for confidence in confidence_values:
        # Generate frequent itemsets with the current support value
        frequent_itemsets = apriori(final_data, min_support=support, use_colnames=True)
        
        # if the frequent_itemsets DataFrame is empty, skip this iteration.
        if frequent_itemsets.empty:
            continue
            
        # Append the frequent_itemsets DataFrame to the list
        frequent_itemsets_list.append(frequent_itemsets)
        
        # Generate the rules with the current confidence value. Filtering out rules where the confidence of the model is less than the confidence value
        # Rules refer to the relationship between the products
        rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=confidence)
        
        # Add the support and confidence values to the rules DataFrame
        rules['Min Support'] = support
        rules['Min Confidence'] = confidence

        # If the rules DataFrame is empty, skip this iteration.
        if rules.empty:
            continue
            
        # Append the rules DataFrame to the list
        rules_list.append(rules)
             
# Concatenate all the DataFrames in the list into a single DataFrame
apriori_df = pd.concat(rules_list, ignore_index=True)

# moving min confidence and min support to the front
cols = apriori_df.columns.tolist()
cols = cols[-2:] + cols[:-2] # kinda hacky way to move the last two columns to the front but oh well
apriori_df = apriori_df[cols]

apriori_df        

Unnamed: 0,Min Support,Min Confidence,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,0.01,0.3,(Blue Steel Max Putter),(Blue Steel Putter),0.066791,0.072761,0.020149,0.301676,4.146111,0.015289,1.327806,0.813119
1,0.01,0.3,(Blue Steel Max Putter),(Hailstorm Titanium Irons),0.066791,0.077985,0.025746,0.385475,4.942931,0.020538,1.500370,0.854783
2,0.01,0.3,(Hailstorm Titanium Irons),(Blue Steel Max Putter),0.077985,0.066791,0.025746,0.330144,4.942931,0.020538,1.393148,0.865160
3,0.01,0.3,(Blue Steel Max Putter),(Hailstorm Titanium Woods Set),0.066791,0.062873,0.022015,0.329609,5.242445,0.017816,1.397881,0.867168
4,0.01,0.3,(Hailstorm Titanium Woods Set),(Blue Steel Max Putter),0.062873,0.066791,0.022015,0.350148,5.242445,0.017816,1.436034,0.863543
...,...,...,...,...,...,...,...,...,...,...,...,...
131,0.01,0.4,"(Blue Steel Max Putter, Lady Hailstorm Titaniu...",(Lady Hailstorm Titanium Woods Set),0.019590,0.059142,0.010448,0.533333,9.017876,0.009289,2.016125,0.906874
132,0.01,0.4,"(Blue Steel Max Putter, Lady Hailstorm Titaniu...",(Lady Hailstorm Titanium Irons),0.022761,0.054664,0.010448,0.459016,8.397023,0.009204,1.747439,0.901428
133,0.01,0.4,"(Lady Hailstorm Titanium Irons, Lady Hailstorm...",(Blue Steel Max Putter),0.020336,0.066791,0.010448,0.513761,7.692071,0.009090,1.919241,0.888055
134,0.01,0.5,"(Blue Steel Max Putter, Lady Hailstorm Titaniu...",(Lady Hailstorm Titanium Woods Set),0.019590,0.059142,0.010448,0.533333,9.017876,0.009289,2.016125,0.906874


In [82]:
itemsets_df = pd.concat(frequent_itemsets_list, ignore_index=True)

# filtering for a support of at least 0.1
frequent_itemsets_df = itemsets_df[itemsets_df['support'] >= 0.1]
infrequent_itemsets_df = itemsets_df[itemsets_df['support'] < 0.1]

In [83]:
# displaying frequent itemsets
frequent_itemsets_df

Unnamed: 0,support,itemsets
0,0.106903,(Canyon Mule Carryall)
1,0.101119,(Double Edge)
2,0.104104,(Sun Shelter 30)
3,0.106903,(Canyon Mule Carryall)
4,0.101119,(Double Edge)
5,0.104104,(Sun Shelter 30)
6,0.106903,(Canyon Mule Carryall)
7,0.101119,(Double Edge)
8,0.104104,(Sun Shelter 30)
9,0.106903,(Canyon Mule Carryall)


In [84]:
# displaying infrequent itemsets
infrequent_itemsets_df

Unnamed: 0,support,itemsets
15,0.057463,(Aloe Relief)
16,0.082649,(Bear Edge)
17,0.073507,(Bear Survival Edge)
18,0.066791,(Blue Steel Max Putter)
19,0.072761,(Blue Steel Putter)
...,...,...
10080,0.016978,"(TrailChef Water Bag, TrailChef Kitchen Kit)"
10081,0.014552,"(TrailChef Single Flame, TrailChef Utensils)"
10082,0.012313,"(TrailChef Single Flame, TrailChef Water Bag)"
10083,0.015672,"(TrailChef Water Bag, TrailChef Utensils)"
