In [84]:
import pandas as pd
import joblib
import pm4py
import numpy as np
import re 

In [85]:
# Load the preprocessed dataset from disk.
# NOTE(review): joblib.load unpickles the file, so only point it at trusted data.
data = joblib.load(
    "../../data/processed/data_preprocessed.pkl"
)

In [86]:
# Derive a coarse "event" label for every row from its activity type.
is_ecommerce = data["activityType"] == "ECOMMERCE"
is_event = data["activityType"] == "EVENT"
is_video = is_event & (data["eventCategory"] == "video youtube")
is_internal_search = is_event & (data["eventCategory"] == "internal search")

# Build the label from the least to the most specific rule, so that each later
# np.where takes precedence over the ones before it:
#   ECOMMERCE > youtube video > internal search > any other EVENT > page load.
event_label = np.where(is_event, data["eventAction"], "LOAD_NEW_PAGE")
event_label = np.where(is_internal_search, "INTERNAL_SEARCH", event_label)
event_label = np.where(is_video, "video_" + data["eventAction"], event_label)
event_label = np.where(is_ecommerce, data["actionType"], event_label)
data["event"] = event_label

# Strip the seconds that follow a video event (all product videos). No split
# limit is given, so every label is cut at its first " - ".
data["event"] = data["event"].str.rsplit(" - ", expand=True)[0]

# Collapse families of raw actions into one canonical label each. Matching is
# case-insensitive; the rules run in order and each one sees the labels already
# written by the rules before it.
event_groups = [
    ("filter by", "FILTER_BY"),        # filter events
    ("remov", "REMOVE_FROM_CART"),     # remove events
    ("compare", "COMPARE_PRODUCTS"),   # compare events
    ("add", "ADD_TO_CART"),            # add events
    ("download", "DOWNLOAD"),          # download events
]
for needle, label in event_groups:
    data.loc[data["event"].str.contains(needle, case=False), "event"] = label



In [87]:
# Lookup table: page category -> page identifiers that belong to it.
# The identifiers are compared against the "page" column derived from pagePath;
# the HOME entries are whole paths rather than single path pieces.
page_categories = {
    "SERVICE": [
        "retour-aanvragen",
        "service-verzoekformulier",
        "customer-service",
        "services",
        "service",
        "webshop-retouraanvraag",
        "contacts",
        "contact",
        "thank-you-retour",
    ],
    "LEGAL": [
        "verkoopsvoorwaarden-algemeen",
        "algemene-verkoopsvoorwaarden",
        "faq",
        "privacy-policy",
        "algemene-verkoopsvoorwaarden-particulier",
        "cookie-policy",
        "conditions-de-vente",
        "conditions-de-vente-generales",
        "conditions-de-vente-generales-entreprise",
        "algemene-verkoopsvoorwaarden-aankoop-als-onderneming",
        "conditions-de-vente-generales-consommateurs",
    ],
    "PROMOTION": [
        "promoties",
        "promotions",
        "bean-to-cup-gratis-cadeaubox",
        "lotus",
    ],
    "COMPARISON": [
        "producten-vergelijken",
        "comparaison-de-produits",
    ],
    "COMPANY": [
        "het-bedrijf",
        "entreprise",
        "Het-bedrijf",
    ],
    "NEWS": [
        "nieuws",
        "actualites",
        "newsletter",
    ],
    "SEARCH": [
        "zoekfunctie",
        "search",
    ],
    "ACCOUNT": [
        "activate",
        "my-account",
        "register",
        "forgot-password",
    ],
    "JOBS": [
        "jobs_product-manager",
        "jobs-chauffeur",
        "jobs_customer_service_medewerker",
        "jobs-key-account-manager",
        "jobs-account-manager",
    ],
    "SHOWROOM": [
        "showroom-bezoek",
        "store-locator",
        "stores",
        "bezoek-showroom",
        "showroom-visiteurs",
        "inplannen-showroom-bezoek",
    ],
    "INFO": [
        "info",
        "3dbestanden",
        "smeg-connect",
        "livrerecettesfourvapeur",
        "app-leefkeuken",
        "stoomkookboek",
        "smegconnect",
        "gebruiksaanwijzingen",
        "telecharger-les-catalogues",
    ],
    "CHECKOUT": [
        "bedankt",
        "merci",
        "wishlist",
        "cart",
        "checkout",
    ],
    "INSPIRATION": [
        "inspiratie-artikels",
        "Designlijnen",
        "inspiratie",
        "designlijnen",
        "inspiration",
        "inspirations",
        "vitality",
        "designlijn",
        "designs",
        "design",
        "samenwerkingen",
        "collaborations",
    ],
    "ERROR": [
        "pagenotfound",
    ],
    "HOME": [
        "/nl",
        "/fr",
        "/be-fr",
        "/be-nl",
    ],
}


# Break each pagePath into its "/"-separated pieces (computed once and reused).
# A path that starts with "/" yields an empty piece 0, so piece 2 is the third
# piece and piece 3 the fourth. `.str[i]` gives NaN when a path has fewer
# pieces instead of raising a KeyError.
path_parts = data["pagePath"].str.split("/")
data["page"] = path_parts.str[2]
data["pageExtra"] = path_parts.str[3]
data["page"] = data["page"].str.split("?").str[0]  # drop search parameters

# For the manuals / catalogues pages below "service(s)", classify the hit by the
# sub-page (pageExtra) rather than by the service section itself.
is_service_subpage = data["page"].isin(["service", "services"]) & data["pageExtra"].isin(
    ["gebruiksaanwijzingen", "telecharger-les-catalogues"]
)
data["page"] = np.where(is_service_subpage, data["pageExtra"], data["page"])

# The HOME paths are too short to have a piece 2; use the whole path as the page
# so that it matches the HOME entries of page_categories.
is_home = data["pagePath"].isin(page_categories["HOME"])
data["page"] = np.where(is_home, data["pagePath"], data["page"])

# Invert the category table into a page -> category lookup and map it in a single
# pass. Should a page ever be listed under two categories, the later category
# wins, exactly as with sequential per-category assignment.
category_by_page = {
    page: category
    for category, pages in page_categories.items()
    for page in pages
}
# Keep the column as object dtype: it holds strings, and writing strings into a
# float (all-NaN) column is deprecated in recent pandas.
data["page_new"] = data["page"].map(category_by_page).astype(object)

# Every remaining page that is present but not listed is treated as a product
# page; rows without a page keep a missing category. Applied once, after all
# categories have been assigned.
data.loc[data["page"].notna() & data["page_new"].isna(), "page_new"] = "PRODUCT"
            

In [88]:
# Append the page category to the event label ("<event>_<category>") for every
# row that has a page; rows without a page keep their label unchanged.
has_page = data["page"].notna()
paged_rows = data.loc[has_page, ["event", "page_new"]]
data.loc[has_page, "event"] = paged_rows["event"] + "_" + paged_rows["page_new"]

In [89]:
# Fold the remaining fine-grained labels into a handful of page-level events.
# Each pattern is a case-insensitive regex searched anywhere in the label; the
# rules run top to bottom and every rule sees the labels already rewritten by
# the rules above it.
# NOTE(review): "product\b" also matches any label that received a "_PRODUCT"
# suffix, whatever the original event was - confirm that collapsing those into
# LOAD_NEW_PAGE_PRODUCT is intended.
page_event_rules = [
    # ("click", "CLICK"),                          # disabled: group click events
    ("product\\b", "LOAD_NEW_PAGE_PRODUCT"),       # product page events
    ("search", "LOAD_NEW_PAGE_SEARCH"),            # search events
    ("promotion", "LOAD_NEW_PAGE_PROMOTION"),      # promotion events
    ("step", "LOAD_NEW_PAGE_CHECKOUT"),            # "step" events -> checkout
    ("phone", "LOAD_NEW_PAGE_SERVICE"),            # contact events
    ("order", "LOAD_NEW_PAGE_CHECKOUT"),           # checkout events
    ("login|account", "LOAD_NEW_PAGE_ACCOUNT"),    # login / account events
    ("compare", "LOAD_NEW_PAGE_COMPARISON"),       # comparison events
]
for pattern, label in page_event_rules:
    data.loc[data["event"].str.contains(pattern, case=False), "event"] = label

In [90]:
# Upper-case every event label (some labels built from raw actions, such as the
# "video_" ones, are not upper case yet).
upper_event = data["event"].str.upper()
data["event"] = upper_event

In [91]:
# Keep only the columns needed for the event log and put the rows of each
# (id, sessionId) in time order; "event" is the tie-breaker for rows that share
# a timestamp.
event_log_columns = [
    "id", "sessionId", "activityTime", "event", "pagePath",
    "productSku", "channelGrouping", "deviceCategory", "platform",
]
data_clean = (
    data.loc[:, event_log_columns]
    # Parse the timestamp BEFORE sorting: the lag / duplicate logic that follows
    # relies on chronological order, and sorting the raw values is only
    # chronological if their representation happens to sort that way.
    .assign(activityTime=lambda frame: pd.to_datetime(frame["activityTime"]))
    .sort_values(["id", "sessionId", "activityTime", "event"], ascending=True)
)

In [92]:
# For every row, fetch the event label and page path of the previous row within
# the same (id, sessionId) group; the first row of each group gets NaN.
previous_row = data_clean.groupby(["id", "sessionId"])[["event", "pagePath"]].shift(1)
data_clean["lagged_event"] = previous_row["event"]
data_clean["lagged_page"] = previous_row["pagePath"]

In [93]:
# Flag a row (1) when it repeats both the event and the page path of the row
# before it in the same session, otherwise 0. Comparisons against NaN are False,
# so the first row of a session and rows without a pagePath are never flagged.
same_event = data_clean["event"] == data_clean["lagged_event"]
same_page = data_clean["pagePath"] == data_clean["lagged_page"]
data_clean["duplicate"] = np.where(same_event & same_page, 1, 0)

In [94]:
# Show how many rows were flagged as consecutive duplicates (1) versus kept (0).
duplicate_counts = data_clean["duplicate"].value_counts()
print(duplicate_counts)

0    86596
1     7572
Name: duplicate, dtype: int64


In [95]:
# Drop the rows flagged as consecutive duplicates and keep only the output
# columns (the helper columns and pagePath are no longer needed).
output_columns = [
    "id", "sessionId", "activityTime", "event",
    "productSku", "channelGrouping", "deviceCategory", "platform",
]
is_not_duplicate = data_clean["duplicate"] == 0
data_clean = data_clean.loc[is_not_duplicate, output_columns]

In [96]:
# Make sure the activity timestamp is stored as a pandas datetime column.
parsed_activity_time = pd.to_datetime(data_clean["activityTime"])
data_clean["activityTime"] = parsed_activity_time

In [97]:
#cutoff_s = 30
#cutoff_events = 1
#data_to_delete = data_clean.groupby(["id", "sessionId"]).apply(lambda x: (x["activityTime"].max() - x["activityTime"].min()).total_seconds()).reset_index()
#data_to_delete2 = data_clean.groupby(["id", "sessionId"]).size().reset_index()

#data_to_delete = data_to_delete.merge(data_to_delete2, how="left", on=["id", "sessionId"])
#data_to_delete.columns = ["id", "sessionId", "n_seconds", "n_events"]
#data_to_delete = data_to_delete.loc[(data_to_delete["n_seconds"] <= cutoff_s) | (data_to_delete["n_events"] <= cutoff_events), :]

In [98]:
#data_clean = data_clean.loc[~np.array(data_clean["id"].isin(data_to_delete["id"]) | data_clean["sessionId"].isin(data_to_delete["sessionId"])), :]

In [99]:
# Persist the cleaned event log as a pickle file.
data_clean.to_pickle(
    "../../data/processed/data_clean.pkl"
)