In [50]:
# IMPORT STATEMENTS

# Import Python packages
import pandas as pd
import json
import matplotlib.pyplot as plt
import json

# Import Apriori modules
from mlxtend.frequent_patterns import apriori, association_rules

# Import Snowflake modules
from snowflake.snowpark import Session

In [51]:
# Read the Snowflake login details from a JSON file kept outside the notebook
with open("data_scientist_auth.json") as auth_file:
    data = json.load(auth_file)

# Pull out the three fields the connection needs
username = data["username"]
password = data["password"]
account = data["account"]

# Connection settings: credentials from the file plus the fixed role,
# warehouse, database and schema used by this analysis
connection_parameters = {
    "account": account,
    "user": username,
    "password": password,
    "role": "TASTY_BI",
    "warehouse": "TASTY_BI_WH",
    "database": "frostbyte_tasty_bytes",
    "schema": "analytics",
}

# Open the Snowpark session with those settings
session = (
    Session.builder
    .configs(connection_parameters)
    .create()
)

In [52]:
# Retrieve the USA order-details table (ORDER_DETAILS_USA_MATCHED) from Snowflake through the Snowpark session
data_from_snowflake = session.table("frostbyte_tasty_bytes.analytics.ORDER_DETAILS_USA_MATCHED")

In [53]:
# Convert data_from_snowflake to a pandas DataFrame so the following cells can use pandas operations.
# NOTE(review): this presumably loads every row and column of the table into local memory — confirm the size is acceptable.
df = data_from_snowflake.to_pandas()

In [54]:
# Preview the first rows of the data.
# NOTE(review): the rendered preview includes the E_MAIL and PHONE_NUMBER columns — confirm the output is safe to share before committing it.
df.head()

Unnamed: 0,ORDER_ID,CUSTOMER_ID,TRUCK_ID,LOCATION_ID,SHIFT_ID,SHIFT_START_TIME,SHIFT_END_TIME,ORDER_CHANNEL,ORDER_TS,SERVED_TS,...,E_MAIL,PHONE_NUMBER,ORDER_DETAIL_ID,MENU_ITEM_ID,DISCOUNT_ID,LINE_NUMBER,QUANTITY,UNIT_PRICE,PRICE,ORDER_ITEM_DISCOUNT_AMOUNT
0,447758081,101191,61,15475.0,200524445,15:30:00,22:30:00,,2022-09-22 18:13:43,,...,Zain.Sullivan@hotmail.com,659-899-8290,872457141,19,,0,1,3.0,3.0,
1,447758081,101191,61,15475.0,200524445,15:30:00,22:30:00,,2022-09-22 18:13:43,,...,Zain.Sullivan@hotmail.com,659-899-8290,872457142,11,,1,1,6.0,6.0,
2,447758081,101191,61,15475.0,200524445,15:30:00,22:30:00,,2022-09-22 18:13:43,,...,Zain.Sullivan@hotmail.com,659-899-8290,872457143,15,,2,1,3.0,3.0,
3,447759739,195383,62,2588.0,200524447,15:30:00,22:30:00,,2022-09-22 19:46:15,,...,Aryana.Dennis@hotmail.com,437-446-0786,872462265,24,,0,1,2.0,2.0,
4,447759739,195383,62,2588.0,200524447,15:30:00,22:30:00,,2022-09-22 19:46:15,,...,Aryana.Dennis@hotmail.com,437-446-0786,872462266,27,,1,1,6.0,6.0,


In [55]:
# Count the missing values in each column
df.isna().sum()

ORDER_ID                           0
CUSTOMER_ID                        0
TRUCK_ID                           0
LOCATION_ID                        0
SHIFT_ID                           0
SHIFT_START_TIME                   0
SHIFT_END_TIME                     0
ORDER_CHANNEL                 125999
ORDER_TS                           0
SERVED_TS                     125999
ORDER_CURRENCY                     0
ORDER_AMOUNT                       0
ORDER_TAX_AMOUNT              125999
ORDER_DISCOUNT_AMOUNT         125999
ORDER_TOTAL                        0
MAX_ORDER_TS                       0
FIRST_NAME                         0
LAST_NAME                          0
CITY                               0
COUNTRY                            0
POSTAL_CODE                       25
PREFERRED_LANGUAGE                 0
GENDER                             0
FAVOURITE_BRAND               125999
MARITAL_STATUS                     0
CHILDREN_COUNT                     0
SIGN_UP_DATE                       0
B

In [56]:
# Remove every column that holds at least one missing value
df = df.dropna(axis="columns")
# Show the first rows, ordered by ORDER_ID
df.sort_values(by="ORDER_ID").head()

Unnamed: 0,ORDER_ID,CUSTOMER_ID,TRUCK_ID,LOCATION_ID,SHIFT_ID,SHIFT_START_TIME,SHIFT_END_TIME,ORDER_TS,ORDER_CURRENCY,ORDER_AMOUNT,...,SIGN_UP_DATE,BIRTHDAY_DATE,E_MAIL,PHONE_NUMBER,ORDER_DETAIL_ID,MENU_ITEM_ID,LINE_NUMBER,QUANTITY,UNIT_PRICE,PRICE
67589,4063760,132433,16,14837.0,21202,08:00:00,14:00:00,2022-11-01 08:04:29,USD,7.0,...,2020-01-30,1952-01-23,Parker.Hansen@ymail.com,700-864-8862,11110334,13,0,1,7.0,7.0
67590,4063773,24203,16,14837.0,21202,08:00:00,14:00:00,2022-11-01 08:28:55,USD,31.0,...,2019-03-29,2000-08-23,Aarav.Maxwell@gmail.com,848-855-5772,11110369,18,0,1,5.0,5.0
67591,4063773,24203,16,14837.0,21202,08:00:00,14:00:00,2022-11-01 08:28:55,USD,31.0,...,2019-03-29,2000-08-23,Aarav.Maxwell@gmail.com,848-855-5772,11110370,19,1,2,3.0,6.0
67592,4063773,24203,16,14837.0,21202,08:00:00,14:00:00,2022-11-01 08:28:55,USD,31.0,...,2019-03-29,2000-08-23,Aarav.Maxwell@gmail.com,848-855-5772,11110371,12,2,3,6.0,18.0
67593,4063773,24203,16,14837.0,21202,08:00:00,14:00:00,2022-11-01 08:28:55,USD,31.0,...,2019-03-29,2000-08-23,Aarav.Maxwell@gmail.com,848-855-5772,11110372,14,3,1,2.0,2.0


In [57]:
# Number of distinct menu items present in the data
df["MENU_ITEM_ID"].nunique()

100

In [58]:
# Number of distinct orders present in the data
df["ORDER_ID"].nunique()

46300

In [59]:
# Build the order x menu-item basket table: one row per ORDER_ID, one column
# per MENU_ITEM_ID, each cell holding the total QUANTITY of that item in the order
mybasket = (
    df.groupby(["ORDER_ID", "MENU_ITEM_ID"])["QUANTITY"]
    .sum()                    # total quantity per (order, item) pair
    .unstack()                # spread MENU_ITEM_ID across the columns
    .reset_index()            # ORDER_ID temporarily becomes a regular column
    .fillna(0)                # items missing from an order count as 0
    .set_index("ORDER_ID")    # restore ORDER_ID as the row index
)

# Sort by order id for a rough visual check of the values
mybasket.sort_values("ORDER_ID").head()

MENU_ITEM_ID,10,11,12,13,14,15,16,17,18,19,...,143,144,145,146,151,152,153,154,155,156
ORDER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4063760,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4063773,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4063800,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4063819,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4063823,1.0,0.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
# Convert every positive value to 1 and everything else to 0
def my_encode_units(x):
    """Binarise a basket quantity.

    Parameters
    ----------
    x : int or float
        Quantity of one menu item in one order.

    Returns
    -------
    int
        1 if ``x`` is greater than zero, otherwise 0. Zero, negative values
        and NaN all map to 0, so the function never returns ``None``.
    """
    # A single comparison covers every numeric input: fractions between 0 and 1
    # count as positive, and NaN compares False against 0 so it lands on 0.
    return 1 if x > 0 else 0
    
# Encode the basket as 0/1 flags with one vectorised comparison instead of a
# Python call per cell. For whole-number quantities this gives the same table
# as mybasket.applymap(my_encode_units), and it avoids DataFrame.applymap,
# which pandas deprecated in version 2.1.
my_basket_sets = (mybasket > 0).astype("int64")

### Training Model

In [61]:
# Generate the frequent itemsets with a minimum support of 0.025.
# The basket is cast to bool at the call site because mlxtend's apriori expects
# a boolean one-hot frame; 0/1 integers give the same itemsets but trigger a
# deprecation warning and a slower code path.
my_frequent_itemsets = apriori(my_basket_sets.astype(bool), min_support=0.025, use_colnames=True)



In [62]:
# Inspect up to the first 90 frequent itemsets together with their support
my_frequent_itemsets.head(90)

Unnamed: 0,support,itemsets
0,0.028812,(11)
1,0.028639,(12)
2,0.027603,(13)
3,0.027322,(17)
4,0.026566,(18)
...,...,...
85,0.027927,"(141, 143)"
86,0.027927,"(142, 143)"
87,0.029482,"(152, 151)"
88,0.028790,"(153, 151)"


In [63]:
# Generate association rules from the frequent itemsets, using lift as the metric with a minimum threshold of 14
my_rules = association_rules(my_frequent_itemsets, metric="lift", min_threshold=14)

In [64]:
# (number of rules, number of columns) in the rules table
my_rules.shape

(8, 10)

In [65]:
# Preview the first generated rules
my_rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(72),(71),0.043737,0.043801,0.029028,0.663704,15.152604,0.027112,2.843322,0.976723
1,(71),(72),0.043801,0.043737,0.029028,0.662722,15.152604,0.027112,2.835237,0.976789
2,(101),(102),0.042311,0.042009,0.026976,0.63757,15.177121,0.025199,2.643247,0.975381
3,(102),(101),0.042009,0.042311,0.026976,0.642159,15.177121,0.025199,2.6763,0.975073
4,(102),(103),0.042009,0.043758,0.027797,0.661697,15.121696,0.025959,2.826581,0.974821
