In [2]:
import pandas as pd
import numpy as np

# usersDF = pd.read_json("./data/processed/converted/users.json")
# sessionsDF = pd.read_json("./data/processed/converted/sessions.json")
# productsDF = pd.read_json("./data/processed/converted/products.json")
# deliveriesDF = pd.read_json("./data/processed/converted/deliveries.json")

usersDF = pd.read_json("./data/raw/users.jsonl", lines=True)
sessionsDF = pd.read_json("./data/raw/sessions.jsonl", lines=True)
productsDF = pd.read_json("./data/raw/products.jsonl", lines=True)
deliveriesDF = pd.read_json("./data/raw/deliveries.jsonl", lines=True)


Searching for anomalies and missing data.

In [3]:
usersNan = usersDF.isna().sum()
print(f"NaN values in user data\n{usersNan}\n")

productsNan = productsDF.isna().sum()
print(f"NaN values in products data\n{productsNan}\n")

sessionsNan = sessionsDF.isna().sum()
print(f"NaN values in sessions data\n{sessionsNan}\n")

NaN values in user data
user_id    0
name       0
city       0
street     0
dtype: int64

NaN values in products data
product_id       0
product_name     0
category_path    0
price            0
user_rating      0
dtype: int64

NaN values in sessions data
session_id               0
timestamp                0
user_id                  0
product_id               0
event_type               0
offered_discount         0
purchase_id         114253
dtype: int64



In [4]:
mergedDF = pd.merge(sessionsDF, usersDF, on="user_id")
mergedDF = pd.merge(mergedDF, productsDF, on="product_id")

mergedNan = mergedDF.isna().sum()
print(f"NaN values in merged data\n{mergedNan}\n")

NaN values in merged data
session_id               0
timestamp                0
user_id                  0
product_id               0
event_type               0
offered_discount         0
purchase_id         114253
name                     0
city                     0
street                   0
product_name             0
category_path            0
price                    0
user_rating              0
dtype: int64



In [5]:
noNanSessionsDF = sessionsDF.dropna()
noNanSessionsDF.count()/sessionsDF.count()


session_id          0.087042
timestamp           0.087042
user_id             0.087042
product_id          0.087042
event_type          0.087042
offered_discount    0.087042
purchase_id         1.000000
dtype: float64

In [6]:
statsDF = mergedDF.groupby(pd.Grouper(key="timestamp", axis=0, freq="1D", sort=True))

productsViewStats = statsDF["product_id"].count().describe()
print(f"Products view stats\n{productsViewStats}\n")
productsBuyStats = statsDF["purchase_id"].count().describe()
print(f"Purchase stats\n{productsBuyStats}\n")

Products view stats
count    300.000000
mean     417.153333
std       62.835320
min      233.000000
25%      372.000000
50%      414.500000
75%      456.250000
max      632.000000
Name: product_id, dtype: float64

Purchase stats
count    300.000000
mean      36.310000
std        6.030072
min       23.000000
25%       32.000000
50%       36.000000
75%       40.000000
max       59.000000
Name: purchase_id, dtype: float64



In [7]:
usersDF.street

0             al. Kamienna 87
1           ul. Szpitalna 638
2              ulica Dolna 26
3         ul. Osiedlowa 23/03
4       aleja Krakowska 84/76
                ...          
195           ul. Pogodna 975
196          plac Rolna 73/65
197       plac Piaskowa 83/06
198    plac Storczykowa 45/59
199          plac Zielona 336
Name: street, Length: 200, dtype: object

In [8]:
productsStatsDF = productsDF.groupby(["category_path"])
display(productsDF.category_path.value_counts())
display(productsStatsDF.price.describe())
display(productsDF.loc[productsDF["price"] == productsDF["price"].min()])
display(productsDF.loc[productsDF["price"] < 1])



Gry i konsole;Gry komputerowe                                        202
Gry i konsole;Gry na konsole;Gry Xbox 360                             32
Sprzęt RTV;Video;Telewizory i akcesoria;Anteny RTV                    30
Komputery;Monitory;Monitory LCD                                       17
Komputery;Drukarki i skanery;Biurowe urządzenia wielofunkcyjne         9
Gry i konsole;Gry na konsole;Gry PlayStation3                          9
Telefony i akcesoria;Akcesoria telefoniczne;Zestawy głośnomówiące      5
Telefony i akcesoria;Akcesoria telefoniczne;Zestawy słuchawkowe        4
Telefony i akcesoria;Telefony komórkowe                                2
Sprzęt RTV;Video;Odtwarzacze DVD                                       2
Komputery;Tablety i akcesoria;Tablety                                  2
Sprzęt RTV;Przenośne audio i video;Odtwarzacze mp3 i mp4               2
Telefony i akcesoria;Telefony stacjonarne                              1
Sprzęt RTV;Video;Telewizory i akcesoria;Okulary 3D 

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
category_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Gry i konsole;Gry komputerowe,202.0,37.995248,32.529106,1.0,18.405,31.945,39.99,199.9
Gry i konsole;Gry na konsole;Gry PlayStation3,9.0,87.63,22.287582,59.9,79.9,79.9,99.99,129.0
Gry i konsole;Gry na konsole;Gry Xbox 360,32.0,76.674375,28.442363,14.99,49.99,74.945,89.99,139.99
Komputery;Drukarki i skanery;Biurowe urządzenia wielofunkcyjne,9.0,4238.504444,1872.985317,1998.14,2399.0,4598.0,5259.0,7639.0
Komputery;Monitory;Monitory LCD,17.0,955.805294,693.800589,269.0,609.0,739.0,1079.0,3029.0
Komputery;Tablety i akcesoria;Tablety,2.0,2066.995,353.588746,1816.97,1941.9825,2066.995,2192.0075,2317.02
Sprzęt RTV;Audio;Słuchawki,1.0,553.0,,553.0,553.0,553.0,553.0,553.0
Sprzęt RTV;Przenośne audio i video;Odtwarzacze mp3 i mp4,2.0,71.85,9.970206,64.8,68.325,71.85,75.375,78.9
Sprzęt RTV;Video;Odtwarzacze DVD,2.0,151.0,59.39697,109.0,130.0,151.0,172.0,193.0
Sprzęt RTV;Video;Telewizory i akcesoria;Anteny RTV,30.0,110.775667,47.243374,29.99,79.99,106.0,133.5,219.0


Unnamed: 0,product_id,product_name,category_path,price,user_rating
140,1141,Król Futbolu Piłkarski Quiz (PC),Gry i konsole;Gry komputerowe,1.0,3.462897
192,1193,Heroes Over Europe (PC),Gry i konsole;Gry komputerowe,1.0,4.549431
271,1272,The Ball (PC),Gry i konsole;Gry komputerowe,1.0,2.286441


Unnamed: 0,product_id,product_name,category_path,price,user_rating


In [9]:
productsDF.describe()

Unnamed: 0,product_id,price,user_rating
count,319.0,319.0,319.0
mean,1160.0,247.787962,2.648154
std,92.231593,813.788548,1.488174
min,1001.0,1.0,0.013905
25%,1080.5,24.99,1.342497
50%,1160.0,41.0,2.780512
75%,1239.5,92.995,3.932414
max,1319.0,7639.0,4.993596


In [19]:
sessionsDF

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id
0,124,2021-05-13 06:07:09,102,1317,VIEW_PRODUCT,0,
1,125,2021-09-11 01:25:08,102,1170,VIEW_PRODUCT,0,
2,125,2021-09-11 01:29:38,102,1055,VIEW_PRODUCT,0,
3,125,2021-09-11 01:30:44,102,1053,VIEW_PRODUCT,0,
4,125,2021-09-11 01:33:50,102,1060,VIEW_PRODUCT,0,
...,...,...,...,...,...,...,...
125141,21382,2021-06-15 13:52:46,301,1054,VIEW_PRODUCT,15,
125142,21382,2021-06-15 13:53:42,301,1268,VIEW_PRODUCT,15,
125143,21382,2021-06-15 13:58:33,301,1056,VIEW_PRODUCT,15,
125144,21382,2021-06-15 13:59:46,301,1057,VIEW_PRODUCT,15,


Badanie gęstości interkacji Klient - Produkt

In [37]:
df = sessionsDF.drop(columns=["session_id", "timestamp", "event_type", "offered_discount", "purchase_id"])
df["hit"] = 1
heatMapDF = pd.pivot_table(df, index="user_id", columns="product_id", values="hit")
display(heatMapDF)
print("Gęstość macierzy: " + str((heatMapDF.size - heatMapDF.isna().sum().sum())/heatMapDF.size))


product_id,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,...,1310,1311,1312,1313,1314,1315,1316,1317,1318,1319
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
102,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,,,1.0,1.0,1.0,,1.0
103,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,,1.0,,1.0,1.0,1.0,1.0,1.0
104,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,1.0,,,1.0,1.0,1.0,,1.0
105,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,...,,1.0,,,,1.0,1.0,1.0,,1.0
106,,1.0,,,,,,1.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,,,1.0,1.0,1.0,1.0,1.0,1.0
298,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,1.0,1.0,,,,1.0,,1.0,1.0
299,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0
300,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,...,,1.0,,,,,1.0,,,


Gęstość macierzy: 0.47782131661442007


Badanie korelacji