In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
def print_evaluation(pipeline_or_model: str, X_train, X_test, y_train, y_test, y_train_pred, y_test_pred):
    """Print a small table of accuracy, recall and precision for both splits.

    Args:
        pipeline_or_model: Label shown in the first line of the report.
        X_train: Training features; only ``.shape`` is read (rows, columns).
        X_test: Test features; only ``.shape`` is read (rows, columns).
        y_train: True labels of the training split.
        y_test: True labels of the test split.
        y_train_pred: Predicted labels for the training split.
        y_test_pred: Predicted labels for the test split.

    Returns:
        None. The report is written to stdout.

    The metrics are computed with the default arguments of ``accuracy_score``,
    ``recall_score`` and ``precision_score``.
    """
    metrics = (accuracy_score, recall_score, precision_score)

    # Score the training split first, then the test split.
    train_scores = tuple(metric(y_train, y_train_pred) for metric in metrics)
    test_scores = tuple(metric(y_test, y_test_pred) for metric in metrics)

    title = f"{pipeline_or_model} Evaluation:"
    header = f"{'':6} {'ACC':>10} | {'REC':>14} | {'PRE':>10} | {'rows':>8} | {'columns':>8}"
    rows = [
        f"{label:6} {acc:10.5f} | {rec:14.5f} | {pre:10.5f} | {features.shape[0]:8} | {features.shape[1]:8}"
        for label, (acc, rec, pre), features in (
            ("Train", train_scores, X_train),
            ("Test", test_scores, X_test),
        )
    ]

    # The trailing "\n" keeps the blank line after the table.
    print("\n".join([title, header, *rows]) + "\n")

In [None]:
# Load the raw shopping-cart data from the parquet file next to the notebook.
df = pd.read_parquet(path="DSCB310 - UE2 - Shopping Carts.parquet")

In [None]:
# Replace the index with a fresh 0..n-1 range (old index discarded), then
# fill every missing value in every column with 0.
df = df.reset_index(drop=True).fillna(0)

In [None]:
# Collapse the frame to one row per (user, order) and order each user's
# orders chronologically by order_number.
df_tip = (
    df.groupby(["user_id", "order_id", "order_number"], as_index=False)
    .agg(tip=("tip", "mean"))
    .sort_values(by=["user_id", "order_number"], ascending=True)
)
df_tip["tip_cumsum"] = df_tip.groupby("user_id")["tip"].cumsum()

# user_tip_ratio must only use tips from *earlier* orders: `tip` of the current
# order is the prediction target, so including it in the feature leaks the
# label (for the first order the feature would equal the label).
# assumes order_number starts at 1 and has no gaps per user - TODO confirm
prior_tips = df_tip["tip_cumsum"] - df_tip["tip"]
prior_orders = (df_tip["order_number"] - 1).where(df_tip["order_number"] > 1)
# Orders without any history (no prior orders) get a ratio of 0.
df_tip["user_tip_ratio"] = (prior_tips / prior_orders).fillna(0)

df_tip_merge = df_tip[["user_id", "order_id", "order_number", "user_tip_ratio"]]
# Drop a user_tip_ratio left over from a previous run of this cell so that
# re-running does not create user_tip_ratio_x / user_tip_ratio_y columns.
df = df.drop(columns="user_tip_ratio", errors="ignore").merge(
    df_tip_merge, on=["user_id", "order_id", "order_number"], how="right"
)

In [None]:
# Work on an independent copy so the classifier preparation does not touch df.
df_classifier = df.copy(deep=True)

In [None]:
# One-hot encode the "county" column; all other columns are passed through.
# NOTE(review): re-running this cell alone fails because "county" is gone
# after the first run - re-run the copy cell first.
df_classifier = pd.get_dummies(data=df_classifier, columns=["county"])

In [None]:
# Display the prepared frame to inspect the encoded columns.
df_classifier

In [None]:
# Target: the "tip" column. Features: everything except the target and the
# three product description columns.
y = df_classifier["tip"]
X = df_classifier.drop(columns=["product_name", "department", "aisle", "tip"])

In [None]:
# Hold out 20 % of the *orders* (not of the rows). The frame has several rows
# per order and the tip is treated as an order-level value in the feature
# engineering cell, so a plain row-wise split would place rows of the same
# order in both train and test and inflate the test scores.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
train_idx, test_idx = next(splitter.split(X, y, groups=X["order_id"]))

# split() yields positional indices, hence .iloc.
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

In [None]:
# Exhaustive search over learning rate and tree depth, scored by accuracy.
# max_iter and random_state are single-value entries, i.e. fixed for all runs.
# NOTE(review): no `cv` is passed, so the default cross-validation splits rows
# without regard to order_id - consider a group-aware splitter; confirm intent.
gs_crss = GridSearchCV(
    estimator=HistGradientBoostingClassifier(),
    param_grid={
        "max_iter": [100],
        "learning_rate": [0.001, 0.01, 0.1],
        "max_depth": [1, 3, 5, 9],
        "random_state": [1],
    },
    scoring="accuracy",
    n_jobs=-1,
)
gs_crss.fit(X_train, y_train)

print(f"Best param: {gs_crss.best_params_} with acc: {gs_crss.best_score_}")

In [None]:
# NOTE(review): this creates an unfitted estimator with default parameters and
# only displays it; the result is not assigned - looks like a leftover cell.
HistGradientBoostingClassifier()