![download.png](download.png)


## Import Packages

In [2]:
#Import Python packages
import pandas as pd
import plotly.express as px
import json
import sys
import cachetools
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import numpy as np
import json
import datetime

# for splitting of the dataset
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

# for encoding/transforming the data
from feature_engine.encoding import OneHotEncoder

#ML-Models
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

# for getting metrics
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

# model selection
from sklearn.model_selection import GridSearchCV

# Import Snowflake modules
from snowflake.snowpark import Session
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
from snowflake.snowpark import Window

## Connect to Snowflake

In [3]:
# Get account credentials from a json file
with open("data_scientist_auth.json") as f:
    data = json.load(f)
    username = data["username"]
    password = data["password"]
    account = data["account"]

# Specify connection parameters
connection_parameters = {
    "account": account,
    "user": username,
    "password": password,
    "role": "TASTY_BI",
    "warehouse": "TASTY_BI_WH",
    "database": "frostbyte_tasty_bytes",
    "schema": "analytics",
}

# Create Snowpark session
session = Session.builder.configs(connection_parameters).create()

## Gathering Data as Snowpark DataFrame

In [4]:
#Orders_V
tasty_bytes_orders_v = session.table("frostbyte_tasty_bytes.analytics.orders_v")
#Customer Loyalty
tasty_bytes_customer_loyalty = session.table("frostbyte_tasty_bytes.raw_customer.customer_loyalty")

## Preview The Data

In [28]:
#Filter to only Australia
filtered_df=tasty_bytes_orders_v.filter(F.col('COUNTRY')=='Australia')

#Merge to get loyal customers orders
loyal_customer_orders=tasty_bytes_customer_loyalty.join(
    right=filtered_df,
    on="CUSTOMER_ID",
    how="inner"
)
loyal_customer_orders.show()
print('Number of rows:',loyal_customer_orders.count())

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [6]:
#Function to remove and rename columns after inner join
def remove_dup_join_col(df):
    # capitalise all columns
    for col in df.columns:
        df = df.withColumnRenamed(col, col.upper())
    
    # get list of renamable columns for left and right join
    left_dup_col = [col_name for col_name in df.columns if col_name.startswith('L_')]
    right_dup_col = [col_name for col_name in df.columns if col_name.startswith('R_')]
    old_columns = df.columns
    
    # rename the list with the most number of renamable columns
    # drop the list of columns with less number of renamable columns
    if len(left_dup_col)>len(right_dup_col):
        columns_rename=left_dup_col
        df= df.drop(*right_dup_col)
    else:
        columns_rename=right_dup_col
        df= df.drop(*left_dup_col)
    
    for old_column in old_columns:
        if old_column in columns_rename:
            # get string to remove
            string_to_replace = columns_rename[0][:7]
            # replace starting string
            new_column = old_column.replace(string_to_replace, "")
            df = df.withColumnRenamed(old_column, new_column)
    
    return df

In [29]:
loyal_customer_orders = remove_dup_join_col(loyal_customer_orders)
loyal_customer_orders.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"CUSTOMER_ID"  |"CITY"     |"POSTAL_CODE"  |"PREFERRED_LANGUAGE"  |"FAVOURITE_BRAND"  |"SIGN_UP_DATE"  |"BIRTHDAY_DATE"  |"DATE"      |"ORDER_ID"  |"TRUCK_ID"  |"ORDER_TS"

In [96]:
#Total Quantity of each order
temp_df=loyal_customer_orders.group_by(F.col('ORDER_ID')).agg(F.sum(F.col('QUANTITY')).alias('Total_Quantity'))
#Remove duplicate order ids
remove_dup = loyal_customer_orders.dropDuplicates(['ORDER_ID'])
#Merge to get transactions table
combined_df=remove_dup.join(
    right=temp_df,
    on="ORDER_ID",
    how="inner"
)
combined_df.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_ID"  |"CUSTOMER_ID"  |"CITY"     |"POSTAL_CODE"  |"PREFERRED_LANGUAGE"  |"FAVOURITE_BRAND"  |"SIGN_UP_DA

In [97]:
# assuming you have a DataFrame called `orders` with columns `customer_id` and `order_ts`
window_spec = Window.partitionBy('customer_id').orderBy(F.col('order_ts').desc())

orders_with_rank = combined_df.withColumn('rank', F.rank().over(window_spec))

max_order_ts = orders_with_rank.filter(F.col('rank') == 1).select('customer_id', 'order_ts').to_pandas()
max_order_ts =max_order_ts.drop_duplicates()
second_max_order_ts = orders_with_rank.filter(F.col('rank') == 2).select('customer_id', 'order_ts').to_pandas()
second_max_order_ts =second_max_order_ts.drop_duplicates()

In [120]:
second_max_order_ts=second_max_order_ts.rename(columns={'ORDER_TS':'2ndMax_Order_TS'})
max_order_ts =max_order_ts.rename(columns={'ORDER_TS':'Max_Order_TS'})

In [100]:
loyal_orders_df=combined_df.to_pandas()

In [103]:
loyal_orders_df.value_counts('CUSTOMER_ID').min()

25

In [153]:
temp_df=loyal_orders_df.merge(second_max_order_ts,on='CUSTOMER_ID',how='inner')

In [154]:
last_purchase_gone=temp_df[temp_df['ORDER_TS']<=temp_df['2ndMax_Order_TS']]

In [117]:
#Recency
recency = last_purchase_gone.groupby('CUSTOMER_ID')['ORDER_TS'].max().reset_index()
recency['RECENCY'] = (pd.Timestamp('2022-11-01 22:42:53') - recency['ORDER_TS']).dt.days

# Frequency
frequency = last_purchase_gone.groupby('CUSTOMER_ID')['ORDER_ID'].nunique().reset_index().rename(columns={'ORDER_ID': 'frequency'})

# Monetary
monetary = last_purchase_gone.groupby('CUSTOMER_ID')['ORDER_TOTAL'].sum().reset_index().rename(columns={'ORDER_TOTAL': 'monetary'})

# Merge into RFM table
RFM_table = recency.merge(right=frequency, how='inner', on='CUSTOMER_ID')
RFM_table = RFM_table.merge(right=monetary, how='inner', on='CUSTOMER_ID')

In [166]:
temp_df = last_purchase_gone.sort_values(['CUSTOMER_ID', 'ORDER_TS'])
# Calculate the time difference between consecutive 'ORDER_TS' values for each customer
temp_df['time_diff'] = temp_df.groupby('CUSTOMER_ID')['ORDER_TS'].diff().dt.days

# Calculate the mean time difference for each customer
avg_time_diff = temp_df.groupby('CUSTOMER_ID')['time_diff'].mean().reset_index().rename(columns={'time_diff': 'AVG_TIME_DIFF'})

#Max Days without Purchase
max_days=temp_df.groupby('CUSTOMER_ID')['time_diff'].max().reset_index().rename(columns={'time_diff': 'Max_Days_Without_Purchase'})

#Min Days without Purchase
min_days=temp_df.groupby('CUSTOMER_ID')['time_diff'].min().reset_index().rename(columns={'time_diff': 'Min_Days_Without_Purchase'})

In [167]:
temp_df=RFM_table.merge(right=max_order_ts,how='inner',on='CUSTOMER_ID')
temp_df=temp_df.merge(right=avg_time_diff,how='inner',on='CUSTOMER_ID')
temp_df=temp_df.merge(right=max_days,how='inner',on='CUSTOMER_ID')
temp_df=temp_df.merge(right=min_days,how='inner',on='CUSTOMER_ID')

In [168]:
temp_df

Unnamed: 0,CUSTOMER_ID,ORDER_TS,RECENCY,frequency,monetary,Max_Order_TS,AVG_TIME_DIFF,Max_Days_Without_Purchase,Min_Days_Without_Purchase
0,43121,2022-10-01 18:46:22,31,51,2118.50,2022-10-12 09:43:00,17.740000,83.0,0.0
1,43266,2022-09-28 10:34:12,34,56,2279.50,2022-10-06 21:50:40,16.690909,70.0,1.0
2,43388,2022-09-22 21:59:43,40,53,2254.50,2022-10-19 09:58:47,16.961538,74.0,0.0
3,43391,2022-09-27 21:18:00,35,46,1710.25,2022-09-29 13:40:44,20.933333,95.0,1.0
4,43411,2022-09-18 21:46:43,44,53,2188.50,2022-10-07 20:26:21,18.346154,84.0,0.0
...,...,...,...,...,...,...,...,...,...
15212,199837,2022-10-05 12:23:38,27,54,2268.25,2022-10-10 13:11:15,15.094340,72.0,0.0
15213,199841,2022-10-06 11:01:33,26,46,1891.00,2022-10-16 11:31:29,17.711111,68.0,0.0
15214,199870,2022-10-07 20:26:20,25,55,2599.50,2022-10-20 12:14:04,14.685185,95.0,0.0
15215,199892,2022-09-20 22:55:02,41,51,2447.75,2022-10-22 15:18:31,15.540000,63.0,0.0


In [169]:
temp_df['Days_To_Next_Order']=(temp_df['Max_Order_TS']-temp_df['ORDER_TS']).dt.days

In [170]:
temp_df['Days_To_Next_Order'].describe()

count    15217.000000
mean        13.051193
std         12.504497
min          0.000000
25%          4.000000
50%         10.000000
75%         18.000000
max        108.000000
Name: Days_To_Next_Order, dtype: float64

In [179]:
X_train

Unnamed: 0,CUSTOMER_ID,RECENCY,frequency,monetary,AVG_TIME_DIFF,Max_Days_Without_Purchase,Min_Days_Without_Purchase
13114,180216,55,41,1421.75,23.475000,102.0,1.0
3328,87631,16,57,2121.50,17.089286,115.0,1.0
8600,137583,11,47,1700.75,17.108696,91.0,0.0
14451,192928,1,52,1683.00,17.372549,64.0,0.0
4021,94413,25,64,2731.00,14.587302,118.0,0.0
...,...,...,...,...,...,...,...
6017,113475,22,49,1858.25,16.354167,78.0,0.0
709,56973,104,42,1511.00,21.292683,94.0,1.0
10679,157738,7,47,2064.50,20.500000,122.0,1.0
8366,135513,48,37,1756.50,17.861111,68.0,0.0


In [175]:
temp_df['NextPurchaseDayRange'] = 1
temp_df.loc[temp_df['Days_To_Next_Order']<=14,'NextPurchaseDayRange'] = 0


In [176]:
# Define Model Inputs (X) and Output (y)
X = temp_df.drop(['Days_To_Next_Order','NextPurchaseDayRange','Max_Order_TS','ORDER_TS'], axis =1)
y = temp_df["NextPurchaseDayRange"]

# Split both Inputs (X) and Ouput (y) into training set (70%) and testing set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)

In [177]:
#Build XGBoost Model
xgb = XGBClassifier(n_estimators = 20, learning_rate = 0.1, eval_metric='logloss')
xgb.fit(X_train, y_train)

In [146]:
#Function to show MSE, MAE, r-sq of Classification models
def evaluateClassificationModelPerformance(model):
    #Accuracy
    print('Train Accuracy:',model.score(X_train, y_train),
          '\nTest Accuracy:',model.score(X_test, y_test))

In [178]:
#Evaluate XGBoost Model
evaluateClassificationModelPerformance(xgb)

Train Accuracy: 0.7796451037461272 
Test Accuracy: 0.7433201927288655


In [None]:
Train Accuracy: 0.5550652520890057 
Test Accuracy: 0.5162067455102934

Train Accuracy: 0.5673645667073515 
Test Accuracy: 0.5127025843188787

In [148]:
#Check top 10 features based on feature importance
ft_weights_xgb_clf = pd.DataFrame(xgb.feature_importances_, columns=['is_promoted'], index=X_train.columns)
ft_weights_xgb_clf.sort_values('is_promoted', ascending=False, inplace=True)
ft_weights_xgb_clf.tail(100)

Unnamed: 0,is_promoted
RECENCY,0.904882
CUSTOMER_ID,0.033462
monetary,0.032506
frequency,0.02915
