In [1]:
import pandas as pd
import numpy as np

# Read the four converted JSON exports into DataFrames in one pass. The paths
# are consumed in order, so the unpacking targets line up as:
# users, sessions, products, deliveries.
usersDF, sessionsDF, productsDF, deliveriesDF = (
    pd.read_json(json_path)
    for json_path in (
        "./data/processed/converted/users.json",
        "./data/processed/converted/sessions.json",
        "./data/processed/converted/products.json",
        "./data/processed/converted/deliveries.json",
    )
)


Searching for anomalies and missing data.

In [23]:
# Per-column count of missing values for each source table.
usersNan = usersDF.isna().sum()
productsNan = productsDF.isna().sum()
sessionsNan = sessionsDF.isna().sum()

# Emit the three reports in the same order as before: users, products, sessions.
print(f"NaN values in user data\n{usersNan}\n")
print(f"NaN values in products data\n{productsNan}\n")
print(f"NaN values in sessions data\n{sessionsNan}\n")

NaN values in user data
user_id    0
name       0
city       0
street     0
dtype: int64

NaN values in products data
product_id       0
product_name     0
category_path    0
price            0
user_rating      0
dtype: int64

NaN values in sessions data
session_id               0
timestamp                0
user_id                  0
product_id               0
event_type               0
offered_discount         0
purchase_id         114253
dtype: int64



In [24]:
# Attach user and product attributes to every session event.
#
# A left join keeps every row of sessionsDF: an event whose user_id or
# product_id has no match in the lookup table stays in the frame with NaN in
# the joined columns. The default inner join would silently drop such rows,
# so the NaN report below could never reveal dangling references.
# When every key matches, the result is the same as with the inner join.
mergedDF = (
    sessionsDF
    .merge(usersDF, on="user_id", how="left")
    .merge(productsDF, on="product_id", how="left")
)

# Per-column count of missing values after the joins; NaN in the user or
# product columns now indicates an event with an unknown user / product.
mergedNan = mergedDF.isna().sum()
print(f"NaN values in merged data\n{mergedNan}\n")

NaN values in merged data
session_id               0
timestamp                0
user_id                  0
product_id               0
event_type               0
offered_discount         0
purchase_id         114253
name                     0
city                     0
street                   0
product_name             0
category_path            0
price                    0
user_rating              0
dtype: int64



In [3]:
# Group the session events by their event type. Note that the result is a
# GroupBy object, not a DataFrame, despite the "DF" suffix.
eventGrSessionsDF = sessionsDF.groupby(by=["event_type"])
# head() on a GroupBy returns the first five rows of *each* group.
eventGrSessionsDF.head(n=5)

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id
0,124,2021-05-13 06:07:09,102,1317,VIEW_PRODUCT,0,
1,125,2021-09-11 01:25:08,102,1170,VIEW_PRODUCT,0,
2,125,2021-09-11 01:29:38,102,1055,VIEW_PRODUCT,0,
3,125,2021-09-11 01:30:44,102,1053,VIEW_PRODUCT,0,
4,125,2021-09-11 01:33:50,102,1060,VIEW_PRODUCT,0,
12,125,2021-09-11 01:59:16,102,1011,BUY_PRODUCT,0,20001.0
31,131,2021-03-18 00:26:34,102,1011,BUY_PRODUCT,0,20002.0
36,132,2021-07-16 10:45:10,102,1011,BUY_PRODUCT,0,20003.0
38,133,2021-04-25 08:58:28,102,1173,BUY_PRODUCT,0,20004.0
49,134,2021-09-28 01:14:36,102,1013,BUY_PRODUCT,10,20005.0


In [41]:
# Bucket the merged events into one-day bins on the "timestamp" column.
# The `axis` keyword is intentionally not passed to pd.Grouper: 0 is its
# default and the keyword is deprecated in recent pandas releases, so omitting
# it keeps the grouping identical while avoiding the deprecation.
statsDF = mergedDF.groupby(pd.Grouper(key="timestamp", freq="1D", sort=True))

# count() tallies the non-null product_id values per day, i.e. every event row
# in the bin regardless of its event_type.
# NOTE(review): if this figure is meant to cover VIEW_PRODUCT events only,
# filter mergedDF on event_type before grouping — confirm the intent.
productsViewStats = statsDF["product_id"].count().describe()
print(f"Products view stats\n{productsViewStats}")

# count() skips NaN, so only rows that carry a purchase_id are counted per day.
productsBuyStats = statsDF["purchase_id"].count().describe()
print(f"Purchase stats\n{productsBuyStats}")


Products view stats
count    300.000000
mean     417.153333
std       62.835320
min      233.000000
25%      372.000000
50%      414.500000
75%      456.250000
max      632.000000
Name: product_id, dtype: float64
Purchase stats
count    300.000000
mean      36.310000
std        6.030072
min       23.000000
25%       32.000000
50%       36.000000
75%       40.000000
max       59.000000
Name: purchase_id, dtype: float64


In [4]:
# Group the events by session. Note that the result is a GroupBy object, not
# a DataFrame, despite the "DF" suffix.
groupedSessionsDF = sessionsDF.groupby(by=["session_id"])
# head() on a GroupBy returns the first five rows of *each* session.
groupedSessionsDF.head(n=5)

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id
0,124,2021-05-13 06:07:09,102,1317,VIEW_PRODUCT,0,
1,125,2021-09-11 01:25:08,102,1170,VIEW_PRODUCT,0,
2,125,2021-09-11 01:29:38,102,1055,VIEW_PRODUCT,0,
3,125,2021-09-11 01:30:44,102,1053,VIEW_PRODUCT,0,
4,125,2021-09-11 01:33:50,102,1060,VIEW_PRODUCT,0,
...,...,...,...,...,...,...,...
125139,21382,2021-06-15 13:46:43,301,1061,VIEW_PRODUCT,15,
125140,21382,2021-06-15 13:48:25,301,1050,VIEW_PRODUCT,15,
125141,21382,2021-06-15 13:52:46,301,1054,VIEW_PRODUCT,15,
125142,21382,2021-06-15 13:53:42,301,1268,VIEW_PRODUCT,15,
