In [14]:
# IMPORT STATEMENTS

#Import Python packages
import pandas as pd
import json
import matplotlib.pyplot as plt
import json

# for splitting of the dataset
from sklearn.model_selection import train_test_split

#Building clustering model
from sklearn.cluster import KMeans 

# Import Snowflake modules
from snowflake.snowpark import Session
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
from snowflake.snowpark import Window
# Caching decorator (presumably for a scoring function defined elsewhere in the notebook)
from cachetools import cached

In [15]:
# Read the Snowflake login details from a local JSON file so that
# no secret is written into the notebook itself
with open("data_scientist_auth.json") as f:
    data = json.load(f)

username = data["username"]
password = data["password"]
account = data["account"]

# Connection settings: credentials from the file plus the fixed
# role / warehouse / database / schema used by this notebook
connection_parameters = dict(
    account=account,
    user=username,
    password=password,
    role="TASTY_BI",
    warehouse="TASTY_BI_WH",
    database="frostbyte_tasty_bytes",
    schema="analytics",
)

# Open the Snowpark session that the following cells query through
session = Session.builder.configs(connection_parameters).create()

In [16]:
# Reference the matched USA order-details table through the Snowpark session
data_from_snowflake = session.table(
    "frostbyte_tasty_bytes.analytics.ORDER_DETAILS_USA_MATCHED"
)

In [17]:
# Convert the Snowpark table reference into a local pandas DataFrame;
# every later cell works on this in-memory copy (`df`)
df = data_from_snowflake.to_pandas()

In [18]:
# Peek at the first five rows to check what was loaded
df.head(n=5)

Unnamed: 0,ORDER_ID,CUSTOMER_ID,TRUCK_ID,LOCATION_ID,SHIFT_ID,SHIFT_START_TIME,SHIFT_END_TIME,ORDER_CHANNEL,ORDER_TS,SERVED_TS,...,E_MAIL,PHONE_NUMBER,ORDER_DETAIL_ID,MENU_ITEM_ID,DISCOUNT_ID,LINE_NUMBER,QUANTITY,UNIT_PRICE,PRICE,ORDER_ITEM_DISCOUNT_AMOUNT
0,452165023,110872,17,14954.0,200544059,15:00:00,22:00:00,,2022-07-30 17:34:42,,...,Jase.Haynes@aol.com,861-138-0409,884494568,21,,0,3,14.0,42.0,
1,452165023,110872,17,14954.0,200544059,15:00:00,22:00:00,,2022-07-30 17:34:42,,...,Jase.Haynes@aol.com,861-138-0409,884494569,23,,1,1,12.0,12.0,
2,452165023,110872,17,14954.0,200544059,15:00:00,22:00:00,,2022-07-30 17:34:42,,...,Jase.Haynes@aol.com,861-138-0409,884494570,27,,2,1,6.0,6.0,
3,452165023,110872,17,14954.0,200544059,15:00:00,22:00:00,,2022-07-30 17:34:42,,...,Jase.Haynes@aol.com,861-138-0409,884494571,28,,3,1,21.0,21.0,
4,452165023,110872,17,14954.0,200544059,15:00:00,22:00:00,,2022-07-30 17:34:42,,...,Jase.Haynes@aol.com,861-138-0409,884494572,29,,4,1,6.0,6.0,


In [23]:
# Count the missing values in every column
df.isna().sum()

ORDER_ID              0
CUSTOMER_ID           0
TRUCK_ID              0
LOCATION_ID           0
SHIFT_ID              0
SHIFT_START_TIME      0
SHIFT_END_TIME        0
ORDER_TS              0
ORDER_CURRENCY        0
ORDER_AMOUNT          0
ORDER_TOTAL           0
MAX_ORDER_TS          0
FIRST_NAME            0
LAST_NAME             0
CITY                  0
COUNTRY               0
PREFERRED_LANGUAGE    0
GENDER                0
MARITAL_STATUS        0
CHILDREN_COUNT        0
SIGN_UP_DATE          0
BIRTHDAY_DATE         0
E_MAIL                0
PHONE_NUMBER          0
ORDER_DETAIL_ID       0
MENU_ITEM_ID          0
LINE_NUMBER           0
QUANTITY              0
UNIT_PRICE            0
PRICE                 0
dtype: int64

In [29]:
# Discard every column that has at least one missing value
df = df.dropna(axis="columns")

# Show the first rows in ORDER_ID order as a quick sanity check
df.sort_values(by="ORDER_ID").head()

Unnamed: 0,ORDER_ID,CUSTOMER_ID,TRUCK_ID,LOCATION_ID,SHIFT_ID,SHIFT_START_TIME,SHIFT_END_TIME,ORDER_TS,ORDER_CURRENCY,ORDER_AMOUNT,...,SIGN_UP_DATE,BIRTHDAY_DATE,E_MAIL,PHONE_NUMBER,ORDER_DETAIL_ID,MENU_ITEM_ID,LINE_NUMBER,QUANTITY,UNIT_PRICE,PRICE
98433,4063760,132433,16,14837.0,21202,08:00:00,14:00:00,2022-11-01 08:04:29,USD,7.0,...,2020-01-30,1952-01-23,Parker.Hansen@ymail.com,700-864-8862,11110334,13,0,1,7.0,7.0
98434,4063773,24203,16,14837.0,21202,08:00:00,14:00:00,2022-11-01 08:28:55,USD,31.0,...,2019-03-29,2000-08-23,Aarav.Maxwell@gmail.com,848-855-5772,11110369,18,0,1,5.0,5.0
98435,4063773,24203,16,14837.0,21202,08:00:00,14:00:00,2022-11-01 08:28:55,USD,31.0,...,2019-03-29,2000-08-23,Aarav.Maxwell@gmail.com,848-855-5772,11110370,19,1,2,3.0,6.0
98436,4063773,24203,16,14837.0,21202,08:00:00,14:00:00,2022-11-01 08:28:55,USD,31.0,...,2019-03-29,2000-08-23,Aarav.Maxwell@gmail.com,848-855-5772,11110371,12,2,3,6.0,18.0
98437,4063773,24203,16,14837.0,21202,08:00:00,14:00:00,2022-11-01 08:28:55,USD,31.0,...,2019-03-29,2000-08-23,Aarav.Maxwell@gmail.com,848-855-5772,11110372,14,3,1,2.0,2.0


In [28]:
# Build the basket matrix: one row per ORDER_ID, one column per MENU_ITEM_ID,
# each cell holding the total QUANTITY of that item within that order.
# Items an order does not contain come out of unstack() as NaN, so they are filled with 0.
# groupby + unstack already leaves ORDER_ID as the index, so no reset_index()/set_index()
# round trip is needed (that round trip also turned the item-id column labels into an
# object-dtype Index).
mybasket = (
    df.groupby(["ORDER_ID", "MENU_ITEM_ID"])["QUANTITY"]
    .sum()
    .unstack()
    .fillna(0)
)

# sort by ORDER_ID and show the first rows to roughly check the values
mybasket.sort_values("ORDER_ID").head()

MENU_ITEM_ID,10,11,12,13,14,15,16,17,18,19,...,143,144,145,146,151,152,153,154,155,156
ORDER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4063760,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4063773,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4063800,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4063819,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4063823,1.0,0.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
