# Analýza nákupního košíku

In [108]:
import itertools
import pathlib

import hvplot.networkx as hvnx
import networkx as nx
import pandas as pd
import pyreadstat

# Route pandas' `.plot` calls through the hvplot backend.
pd.options.plotting.backend = "hvplot"

---

## Příprava datové matice

### Nalezení a načtení SPSS dat. Dostáváme pandas DataFrame a metadata.

In [10]:
# Build the path to the SPSS file relative to the current working directory,
# read it with pyreadstat (returns a DataFrame and a metadata object), and
# print the type of each returned object.
PATH = pathlib.Path()
PATH_SHOPPING_ITEMS = PATH.joinpath("..", "data", "Shopping_items.sav")
df, meta = pyreadstat.read_sav(PATH_SHOPPING_ITEMS)
print(type(df), type(meta), sep="\n")

<class 'pandas.core.frame.DataFrame'>
<class 'pyreadstat._readstat_parser.metadata_container'>


### Užitečná metadata

In [3]:
# Report the row/column counts and the column names stored in the file metadata.
print(f"Počet řádků:\t {meta.number_rows}")
print(f"Počet sloupců:\t {meta.number_columns} ... konkrétně: {meta.column_names}")

Počet řádků:	 2395
Počet sloupců:	 2 ... konkrétně: ['ID', 'ITEM']


### Náhled datové matice

In [11]:
# Show the first ten rows of the loaded frame.
df.head(10)

Unnamed: 0,ID,ITEM
0,1.0,READMADE
1,1.0,SNACKS
2,2.0,READMADE
3,2.0,TOILETRY
4,3.0,READMADE
5,3.0,TOILETRY
6,3.0,SNACKS
7,4.0,READMADE
8,4.0,MILK
9,4.0,BAKERY


### Jaké máme datové typy?

In [5]:
# Show the dtype of every column in the loaded frame.
df.dtypes

ID      float64
ITEM     object
dtype: object

In [50]:
# Distinct values of the ITEM column; reused further down as the graph nodes.
ITEM_TYPES = pd.unique(df["ITEM"])
ITEM_TYPES

array(['READMADE', 'SNACKS', 'TOILETRY', 'MILK', 'BAKERY', 'TINNED',
       'FROZEN', 'ALCOHOL', 'VEG', 'MEAT'], dtype=object)

### Distribution plot – co se jak moc nakupuje

In [7]:
# Bar chart of the number of rows per ITEM value.
df["ITEM"].value_counts().plot(kind="bar")

In [8]:
# Relative frequency of each ITEM value, in percent rounded to two decimals
# and rendered as text with a " %" suffix.
(
    df["ITEM"]
    .value_counts(normalize=True)
    .mul(100)
    .round(2)
    .astype(str)
    .add(" %")
)

ITEM
READMADE    16.16 %
SNACKS      15.57 %
TINNED      14.95 %
BAKERY      14.07 %
FROZEN      13.19 %
ALCOHOL     12.94 %
MILK         6.18 %
TOILETRY     3.26 %
VEG          2.71 %
MEAT         0.96 %
Name: proportion, dtype: object

### Restrukturalizace na příznaky

In [44]:
# One row per basket ID, one 0/1 column per item type.
#
# The indicators are aggregated with `max` rather than `sum`: if the same item
# is listed more than once for a basket, `sum` would produce 2, 3, ... and the
# basket would silently fall out of every later `== 1` filter. `max` keeps the
# columns strictly binary and gives the same result as `sum` whenever there are
# no duplicate (ID, ITEM) rows. `dtype=int` makes the indicators integers
# regardless of the pandas version's default dummy dtype.
flags = (
    pd.get_dummies(df, columns=["ITEM"], prefix="", prefix_sep="", dtype=int)
    .groupby("ID")
    .max()
)
flags

Unnamed: 0_level_0,ALCOHOL,BAKERY,FROZEN,MEAT,MILK,READMADE,SNACKS,TINNED,TOILETRY,VEG
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1.0,0,0,0,0,0,1,1,0,0,0
2.0,0,0,0,0,0,1,0,0,1,0
3.0,0,0,0,0,0,1,1,0,1,0
4.0,0,1,0,0,1,1,0,0,0,0
5.0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
782.0,1,0,0,0,0,0,0,0,0,0
783.0,0,1,1,0,0,1,1,0,0,0
784.0,1,0,0,0,0,0,0,0,0,0
785.0,1,0,1,0,0,0,1,0,0,0


#### Pouze řádky s alkoholem

In [100]:
# Keep only the baskets whose ALCOHOL flag equals 1.
flags_alcohol = flags.loc[flags["ALCOHOL"].eq(1)]
flags_alcohol

Unnamed: 0_level_0,ALCOHOL,BAKERY,FROZEN,MEAT,MILK,READMADE,SNACKS,TINNED,TOILETRY,VEG
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
12.0,1,1,1,0,1,1,1,0,0,0
16.0,1,0,0,0,0,1,0,0,0,0
26.0,1,0,0,0,1,1,1,0,0,0
30.0,1,1,0,0,0,1,0,0,0,0
50.0,1,0,1,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
780.0,1,0,1,0,0,0,0,0,0,0
781.0,1,0,1,0,0,0,0,1,1,0
782.0,1,0,0,0,0,0,0,0,0,0
784.0,1,0,0,0,0,0,0,0,0,0


---

## Analýza dat

### Pavučinový diagram

#### Alkohol

In [104]:
# Per-item totals over the alcohol baskets, and the total over every item
# column except ALCOHOL itself.
flags_alcohol_sum = flags_alcohol.sum()
n_alcohol_connections = flags_alcohol_sum.drop("ALCOHOL").sum()
print(flags_alcohol_sum, "\n\n", n_alcohol_connections, "položek bylo v košíku společně s alkoholem")

ALCOHOL     310
BAKERY      169
FROZEN      181
MEAT         16
MILK         90
READMADE    167
SNACKS      172
TINNED      136
TOILETRY     47
VEG          36
dtype: int64 

 1014 položek bylo v košíku společně s alkoholem


In [105]:
# Web diagram: every item type is a node on a circle and each non-alcohol item
# is connected to ALCOHOL. Edge colour is that item's share (in percent) of
# n_alcohol_connections.
G_web_alcohol = nx.Graph()

# Graph nodes
G_web_alcohol.add_nodes_from(ITEM_TYPES)
nodes_pos = nx.layout.circular_layout(G_web_alcohol)
nodes = hvnx.draw_networkx_nodes(G_web_alcohol, nodes_pos, node_color="blue")

# Node labels, shifted upwards by LABEL_OFFSET relative to the node position
LABEL_OFFSET = 0.1
label_pos = {
    item: [x_coord, y_coord + LABEL_OFFSET]
    for item, (x_coord, y_coord) in nodes_pos.items()
}
labels = hvnx.draw_networkx_labels(G_web_alcohol, label_pos)

# Edges: one (item, "ALCOHOL") pair per non-alcohol item type
edges_data = [(item, "ALCOHOL") for item in ITEM_TYPES if item != "ALCOHOL"]
G_web_alcohol.add_edges_from(edges_data)
# One colour value per edge, listed in the same order as edges_data
edge_colors = [
    flags_alcohol_sum[item] / n_alcohol_connections * 100
    for item, _ in edges_data
]
edges = hvnx.draw_networkx_edges(G_web_alcohol, nodes_pos, edge_color=edge_colors, colorbar=True)

nodes * labels * edges

#### Vše

In [111]:
# For every unordered pair of item types, count the baskets in which both
# flags equal 1; total_pairs is the sum of those counts.
item_pairs = list(itertools.combinations(ITEM_TYPES, 2))
item_pairs_quantities = [
    int(((flags[first] == 1) & (flags[second] == 1)).sum())
    for first, second in item_pairs
]
total_pairs = sum(item_pairs_quantities)

# ---
G_web_all = nx.Graph()

# Graph nodes
G_web_all.add_nodes_from(ITEM_TYPES)
nodes_pos = nx.layout.circular_layout(G_web_all)
nodes = hvnx.draw_networkx_nodes(G_web_all, nodes_pos, node_color="blue")

# Node labels, shifted upwards by LABEL_OFFSET relative to the node position
LABEL_OFFSET = 0.1
label_pos = {
    item: [x_coord, y_coord + LABEL_OFFSET]
    for item, (x_coord, y_coord) in nodes_pos.items()
}
labels = hvnx.draw_networkx_labels(G_web_all, label_pos)

# Edges: one per item pair, coloured by the pair's co-occurrence count
# (item_pairs_quantities is listed in the same order as item_pairs)
edges_data = item_pairs
G_web_all.add_edges_from(edges_data)
edge_colors = item_pairs_quantities
edges = hvnx.draw_networkx_edges(G_web_all, nodes_pos, edge_color=edge_colors, colorbar=True)

nodes * labels * edges

#### Heatmap?