In [11]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

In [2]:
products = pd.read_csv("data/products.csv")
store_purchases = pd.read_csv("data/store_purchases.csv")
users = pd.read_csv("data/users.csv")
web_visits = pd.read_csv("data/web_visits.csv")

In [3]:
store_user = store_purchases.merge(users, how="left", left_on = ["user_id"], right_on = ["user_ID"])
store_user.drop(["user_ID"], inplace=True, axis=1)
store_user

Unnamed: 0,user_id,date,family,class,type,product,product_id,price,card_owner
0,49263,2020-02-26,family_6,class_3,type_4,product_3,00043a7a-ba01-4653-a8c9-dfc1c256372d,5.0,False
1,30940,2022-12-16,family_6,class_3,type_4,product_3,00043a7a-ba01-4653-a8c9-dfc1c256372d,5.0,False
2,11386,2020-02-18,family_6,class_3,type_4,product_3,00043a7a-ba01-4653-a8c9-dfc1c256372d,5.0,False
3,37749,2020-11-04,family_6,class_3,type_4,product_3,00043a7a-ba01-4653-a8c9-dfc1c256372d,5.0,False
4,14128,2021-02-10,family_6,class_3,type_4,product_3,00043a7a-ba01-4653-a8c9-dfc1c256372d,5.0,False
...,...,...,...,...,...,...,...,...,...
171648,34856,2021-08-08,family_6,class_1,type_1,product_12,6bcfe8b4-42c2-4089-bccc-b670ad8210d6,152.0,False
171649,47325,2022-10-21,family_6,class_1,type_1,product_12,6bcfe8b4-42c2-4089-bccc-b670ad8210d6,152.0,False
171650,30303,2022-05-12,family_6,class_1,type_1,product_12,6bcfe8b4-42c2-4089-bccc-b670ad8210d6,152.0,False
171651,20029,2021-01-17,family_6,class_1,type_1,product_12,6bcfe8b4-42c2-4089-bccc-b670ad8210d6,152.0,False


In [4]:
df = web_visits.merge(store_user[["user_id", "product_id", "date", "card_owner"]], how = "left", on=["user_id", "product_id"])
df

Unnamed: 0,family,class,type,web_visit_date,user_id,product,price,product_id,conversion,date,card_owner
0,family_9,class_8,type_2,2022-03-02,1770.0,product_17,103,567e07ef-45e3-406b-a2b1-1175187504be,0,,
1,family_4,class_8,type_2,2022-04-08,43333.0,product_30,1,5f7dcb42-5eb4-4aaf-bd36-40e766fc7449,0,,
2,family_9,class_8,type_1,2022-03-14,33025.0,product_16,2,6d2ac9e4-9baf-4338-b3d9-671bcdcb36eb,0,,
3,family_4,class_8,type_2,2022-04-25,24377.0,product_22,34,717b0355-71aa-4be6-9eff-afa5c7ec95c7,0,,
4,family_4,class_8,type_2,2022-09-19,40274.0,product_32,1,75b6a327-af04-413a-a605-a07cc66b3fb4,0,,
...,...,...,...,...,...,...,...,...,...,...,...
27238191,family_2,class_1,type_1,2020-06-17,18550.0,product_19,163,1b002490-5f32-4482-8a50-83379e511519,0,,
27238192,family_2,class_1,type_1,2020-06-13,27973.0,product_16,1,20b90b1a-beeb-4308-82ba-7c55121141d8,0,,
27238193,family_2,class_1,type_1,2022-02-07,39666.0,product_16,1,20b90b1a-beeb-4308-82ba-7c55121141d8,0,,
27238194,family_5,class_1,type_2,2022-05-09,28123.0,product_5,166,21c5eef8-2ffe-4dbf-9b9a-04f87715e6af,0,2021-12-06,False


In [5]:
df["offline_conversion"] = 0

In [7]:
df_converted = df[df["conversion"]==0]
df_converted

Unnamed: 0,family,class,type,web_visit_date,user_id,product,price,product_id,conversion,date,card_owner,offline_conversion
0,family_9,class_8,type_2,2022-03-02,1770.0,product_17,103,567e07ef-45e3-406b-a2b1-1175187504be,0,,,0
1,family_4,class_8,type_2,2022-04-08,43333.0,product_30,1,5f7dcb42-5eb4-4aaf-bd36-40e766fc7449,0,,,0
2,family_9,class_8,type_1,2022-03-14,33025.0,product_16,2,6d2ac9e4-9baf-4338-b3d9-671bcdcb36eb,0,,,0
3,family_4,class_8,type_2,2022-04-25,24377.0,product_22,34,717b0355-71aa-4be6-9eff-afa5c7ec95c7,0,,,0
4,family_4,class_8,type_2,2022-09-19,40274.0,product_32,1,75b6a327-af04-413a-a605-a07cc66b3fb4,0,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
27238191,family_2,class_1,type_1,2020-06-17,18550.0,product_19,163,1b002490-5f32-4482-8a50-83379e511519,0,,,0
27238192,family_2,class_1,type_1,2020-06-13,27973.0,product_16,1,20b90b1a-beeb-4308-82ba-7c55121141d8,0,,,0
27238193,family_2,class_1,type_1,2022-02-07,39666.0,product_16,1,20b90b1a-beeb-4308-82ba-7c55121141d8,0,,,0
27238194,family_5,class_1,type_2,2022-05-09,28123.0,product_5,166,21c5eef8-2ffe-4dbf-9b9a-04f87715e6af,0,2021-12-06,False,0


In [9]:
df_converted["offline_conversion"] = df_converted["offline_conversion"].where(df_converted["date"].isna(), 1)
df_converted

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_converted["offline_conversion"] = df_converted["offline_conversion"].where(df_converted["date"].isna(), 1)


Unnamed: 0,family,class,type,web_visit_date,user_id,product,price,product_id,conversion,date,card_owner,offline_conversion
0,family_9,class_8,type_2,2022-03-02,1770.0,product_17,103,567e07ef-45e3-406b-a2b1-1175187504be,0,,,0
1,family_4,class_8,type_2,2022-04-08,43333.0,product_30,1,5f7dcb42-5eb4-4aaf-bd36-40e766fc7449,0,,,0
2,family_9,class_8,type_1,2022-03-14,33025.0,product_16,2,6d2ac9e4-9baf-4338-b3d9-671bcdcb36eb,0,,,0
3,family_4,class_8,type_2,2022-04-25,24377.0,product_22,34,717b0355-71aa-4be6-9eff-afa5c7ec95c7,0,,,0
4,family_4,class_8,type_2,2022-09-19,40274.0,product_32,1,75b6a327-af04-413a-a605-a07cc66b3fb4,0,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
27238191,family_2,class_1,type_1,2020-06-17,18550.0,product_19,163,1b002490-5f32-4482-8a50-83379e511519,0,,,0
27238192,family_2,class_1,type_1,2020-06-13,27973.0,product_16,1,20b90b1a-beeb-4308-82ba-7c55121141d8,0,,,0
27238193,family_2,class_1,type_1,2022-02-07,39666.0,product_16,1,20b90b1a-beeb-4308-82ba-7c55121141d8,0,,,0
27238194,family_5,class_1,type_2,2022-05-09,28123.0,product_5,166,21c5eef8-2ffe-4dbf-9b9a-04f87715e6af,0,2021-12-06,False,1


In [15]:
df_converted["web_visit_date"] = pd.to_datetime(df_converted["web_visit_date"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_converted["web_visit_date"] = pd.to_datetime(df_converted["web_visit_date"])


In [16]:
df_converted["date"] = pd.to_datetime(df_converted["date"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_converted["date"] = pd.to_datetime(df_converted["date"])


In [None]:
df_converted["difference"] = 

In [12]:
data = df_converted.copy()

In [13]:
X = data.drop('offline_conversion', axis=1)
y = data['offline_conversion']

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Step 4: XGBoost Model Initialization
model = xgb.XGBClassifier(n_estimators=100, tree_method="gpu_hist",  max_depth=3, learning_rate=0.1, random_state=42)

# Step 6: Model Training
model.fit(X_train, y_train)

# Step 7: Model Evaluation
val_predictions = model.predict(X_val)
accuracy = accuracy_score(y_val, val_predictions)
print(f'Validation Accuracy: {accuracy}')

# Step 10: Final Model Evaluation
test_predictions = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print(f'Test Accuracy: {test_accuracy}')

  or is_sparse(dtype)
  or (is_categorical_dtype(dtype) and enable_categorical)


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:family: object, class: object, type: object, web_visit_date: object, product: object, product_id: object, date: object, card_owner: object

In [None]:
from matplotlib import pyplot as plt
feature_importance = model.feature_importances_
sorted_idx = np.argsort(feature_importance)
fig = plt.figure(figsize=(12, 6))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), np.array(X_test.columns)[sorted_idx])
plt.title('Feature Importance')