In [22]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score

In [23]:
def print_evaluation(pipeline_or_model: str, X_train, X_test, y_train, y_test, y_train_pred, y_test_pred):
    accurary_train = accuracy_score(y_train, y_train_pred)
    recall_train = recall_score(y_train, y_train_pred)
    precision_train = precision_score(y_train, y_train_pred)

    accurary_test = accuracy_score(y_test, y_test_pred)
    recall_test = recall_score(y_test, y_test_pred)
    precision_test = precision_score(y_test, y_test_pred)
    
    print(
        f"{pipeline_or_model} Evaluation:\n"
        f"{'':6} {'ACC':>10} | {'REC':>14} | {'PRE':>10} | {'rows':>8} | {'columns':>8}\n"
        f"{'Train':6} {accurary_train:10.5f} | {recall_train:14.5f} | {precision_train:10.5f} | {X_train.shape[0]:8} | {X_train.shape[1]:8}\n"
        f"{'Test':6} {accurary_test:10.5f} | {recall_test:14.5f} | {precision_test:10.5f} | {X_test.shape[0]:8} | {X_test.shape[1]:8}\n"
    )

In [41]:
df = pd.read_parquet("DSCB310 - UE2 - Shopping Carts.parquet")

In [42]:
df = df.reset_index(drop= True)
df = df.fillna(0)

In [43]:
X = df[["order_id",	"product_id",	"add_to_cart_order",	"user_id",	"order_number",	"order_dow",	"order_hour_of_day",	"days_since_prior_order",	"aisle_id",	"department_id"]]
y = df["tip"]

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [21]:
gs_crossv = GridSearchCV(
 estimator= DecisionTreeClassifier(),
 param_grid= dict(min_samples_split= [2, 10, 20, 50, 100], max_depth= [1, 3, 6, 9], random_state= [0]),
 scoring = "accuracy",
 n_jobs= -1,
 cv= 5
).fit(X_train, y_train)

print(f"Best parameters: {gs_crossv.best_params_}, with score: {gs_crossv.best_score_}")

Best parameters: {'max_depth': 9, 'min_samples_split': 10, 'random_state': 0}, with score: 0.6197656607492457


In [24]:
dtc = DecisionTreeClassifier(min_samples_split= 10, max_depth= 9, random_state= 0).fit(X_train, y_train)

y_pred_train_best_dtc = dtc.predict(X_train)
y_pred_test_best_dtc = dtc.predict(X_test)

print_evaluation("DTC", X_train, X_test, y_train, y_test, y_pred_train_best_dtc, y_pred_test_best_dtc)

DTC Evaluation:
              ACC |            REC |        PRE |     rows |  columns
Train     0.62051 |        0.46649 |    0.59761 |  4906135 |       10
Test      0.61896 |        0.46512 |    0.59375 |  1226534 |       10



In [25]:
print(dict(zip(X.columns, dtc.feature_importances_)))

{'order_id': 0.018367372509618257, 'product_id': 4.5696116588217275e-05, 'add_to_cart_order': 0.020308801118412365, 'user_id': 0.019493335229683193, 'order_number': 0.15710477203972076, 'order_dow': 0.274314290603983, 'order_hour_of_day': 0.12237543752543767, 'days_since_prior_order': 0.33703835980131674, 'aisle_id': 0.0010287656565209013, 'department_id': 0.04992316939871884}


### **Vorhersage tip der aktuellen Bestellung**

In [67]:
df_userhh = df.copy()
df_userhh = df_userhh.sort_values(by= ["user_id", "order_number"], ascending= True)
df_userh1 = df_userhh.groupby("user_id", as_index= False).agg(orders = ("order_id", "unique"), reordered= ("reordered", "mean"), tip= ("tip", "mean"), order_number= ("order_number", "unique"))

last_orders = []
for ind, order in enumerate(df_userh1.orders):
 last_orders.append(df_userh1.orders[ind][-1])

df_userhh.drop(df_userhh.loc[df_userhh.order_id.isin(last_orders)].index, inplace= True)
df_userh = df_userhh.groupby("user_id", as_index= False).agg(orders = ("order_id", "nunique"), reordered= ("reordered", "mean"), tip= ("tip", "mean"))
df_last_order = df.loc[df.order_id.isin(last_orders)].groupby("user_id").agg(tip_last_order= ("tip", "max"), last_order_id = ("order_id", "max")).reset_index()
#df_userh = df_userh.merge(df_last_order, how= "right", on= "user_id")

In [94]:
df1 = df.copy()

In [95]:
df_last_orders = df1.loc[df1.order_id.isin(last_orders)]

In [96]:
df1.drop(df1.loc[df1.order_id.isin(last_orders)].index, inplace= True)

In [97]:
df_tip = df1.groupby(["user_id"], as_index= False).agg(user_tip_ratio = ("tip", "mean"))

In [98]:
df1 = df1.merge(df_tip, on= "user_id", how= "right")

In [114]:
df_last_orders = df_last_orders.merge(df_tip, on= "user_id", how= "right")

In [109]:
X = df1[["order_id",	"product_id",	"add_to_cart_order",	"user_id",	"order_number",	"order_dow",	"order_hour_of_day",	"days_since_prior_order",	"aisle_id",	"department_id", "user_tip_ratio"]]
y = df1["tip"]

In [110]:
X_train, y_train = X , y

In [None]:
X_test, y_test = df_last_orders[["order_id",	"product_id",	"add_to_cart_order",	"user_id",	"order_number",	"order_dow",	"order_hour_of_day",	"days_since_prior_order",	"aisle_id",	"department_id", "user_tip_ratio"]], df_last_orders["tip"]

In [106]:
gs_crossv = GridSearchCV(
 estimator= DecisionTreeClassifier(),
 param_grid= dict(min_samples_split= [2, 10, 20, 50, 100], max_depth= [1, 3, 6, 9], random_state= [0]),
 scoring = "accuracy",
 n_jobs= -1,
 cv= 5
).fit(X_train, y_train)

print(f"Best parameters: {gs_crossv.best_params_}, with score: {gs_crossv.best_score_}")

Best parameters: {'max_depth': 6, 'min_samples_split': 2, 'random_state': 0}, with score: 0.7724332017517882


In [116]:
dtc2 = DecisionTreeClassifier(min_samples_split= 2, max_depth= 6, random_state= 0).fit(X_train, y_train)

y_pred_train_best_dtc2 = dtc2.predict(X_train)
y_pred_test_best_dtc2 = dtc2.predict(X_test)

print_evaluation("DTC", X_train, X_test, y_train, y_test, y_pred_train_best_dtc2, y_pred_test_best_dtc2)

Feature names seen at fit time, yet now missing:
- user_tip_ratio



ValueError: X has 10 features, but DecisionTreeClassifier is expecting 11 features as input.

In [113]:
print(dict(zip(X.columns, dtc2.feature_importances_)))

{'order_id': 6.701943163374691e-05, 'product_id': 0.0, 'add_to_cart_order': 6.541085648993934e-06, 'user_id': 0.0, 'order_number': 0.0, 'order_dow': 0.04376369120058702, 'order_hour_of_day': 0.018309682286576402, 'days_since_prior_order': 0.0, 'aisle_id': 0.0, 'department_id': 0.0, 'user_tip_ratio': 0.9378530659955538}
