# Analýza nákupního košíku

In [45]:
import itertools
import pathlib

import hvplot.networkx as hvnx
import networkx as nx
import pandas as pd
import pyreadstat

from mlxtend.frequent_patterns import apriori, association_rules

pd.options.plotting.backend = "hvplot"

---

## Příprava datové matice

### Nalezení a načtení SPSS dat. Dostáváme pandas DataFrame a metadata.

In [2]:
# Build the path to the SPSS file relative to the current working directory.
PATH = pathlib.Path()
PATH_SHOPPING_ITEMS = PATH.joinpath("..", "data", "Shopping_items.sav")

# read_sav returns a pair, unpacked here into the data frame and its metadata.
df, meta = pyreadstat.read_sav(PATH_SHOPPING_ITEMS)
print(type(df), type(meta), sep="\n")

<class 'pandas.core.frame.DataFrame'>
<class 'pyreadstat._readstat_parser.metadata_container'>


### Užitečná metadata

In [3]:
print("Počet řádků:\t", meta.number_rows)
print("Počet sloupců:\t", meta.number_columns, "... konkrétně:", meta.column_names)

Počet řádků:	 2395
Počet sloupců:	 2 ... konkrétně: ['ID', 'ITEM']


### Náhled datové matice

In [4]:
df.head(10)

Unnamed: 0,ID,ITEM
0,1.0,READMADE
1,1.0,SNACKS
2,2.0,READMADE
3,2.0,TOILETRY
4,3.0,READMADE
5,3.0,TOILETRY
6,3.0,SNACKS
7,4.0,READMADE
8,4.0,MILK
9,4.0,BAKERY


### Jaké máme datové typy?

In [5]:
df.dtypes

ID      float64
ITEM     object
dtype: object

In [6]:
# Distinct values of the ITEM column.
ITEM_TYPES = pd.unique(df["ITEM"])
ITEM_TYPES

array(['READMADE', 'SNACKS', 'TOILETRY', 'MILK', 'BAKERY', 'TINNED',
       'FROZEN', 'ALCOHOL', 'VEG', 'MEAT'], dtype=object)

### Rozdělení četností – co se jak moc nakupuje

In [7]:
df["ITEM"].value_counts().plot(kind="bar")

In [8]:
df["ITEM"].value_counts(normalize=True).mul(100).round(2).astype(str) + " %"

ITEM
READMADE    16.16 %
SNACKS      15.57 %
TINNED      14.95 %
BAKERY      14.07 %
FROZEN      13.19 %
ALCOHOL     12.94 %
MILK         6.18 %
TOILETRY     3.26 %
VEG          2.71 %
MEAT         0.96 %
Name: proportion, dtype: object

### Restrukturalizace na příznaky

In [9]:
# One row per basket (ID) and one column per item type.
# Summing the dummy columns per basket counts occurrences, so an item listed
# twice under the same ID would yield 2 and slip past the `== 1` filters used
# further down; clipping keeps every entry a 0/1 presence flag.
flags = (
    pd.get_dummies(df, columns=["ITEM"], prefix="", prefix_sep="")
    .groupby(["ID"])
    .sum()
    .clip(upper=1)
)
flags

Unnamed: 0_level_0,ALCOHOL,BAKERY,FROZEN,MEAT,MILK,READMADE,SNACKS,TINNED,TOILETRY,VEG
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1.0,0,0,0,0,0,1,1,0,0,0
2.0,0,0,0,0,0,1,0,0,1,0
3.0,0,0,0,0,0,1,1,0,1,0
4.0,0,1,0,0,1,1,0,0,0,0
5.0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
782.0,1,0,0,0,0,0,0,0,0,0
783.0,0,1,1,0,0,1,1,0,0,0
784.0,1,0,0,0,0,0,0,0,0,0
785.0,1,0,1,0,0,0,1,0,0,0


#### Pouze řádky s alkoholem

In [10]:
# Keep only the baskets whose ALCOHOL flag equals 1.
contains_alcohol = flags["ALCOHOL"].eq(1)
flags_alcohol = flags.loc[contains_alcohol]
flags_alcohol

Unnamed: 0_level_0,ALCOHOL,BAKERY,FROZEN,MEAT,MILK,READMADE,SNACKS,TINNED,TOILETRY,VEG
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
12.0,1,1,1,0,1,1,1,0,0,0
16.0,1,0,0,0,0,1,0,0,0,0
26.0,1,0,0,0,1,1,1,0,0,0
30.0,1,1,0,0,0,1,0,0,0,0
50.0,1,0,1,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
780.0,1,0,1,0,0,0,0,0,0,0
781.0,1,0,1,0,0,0,0,1,1,0
782.0,1,0,0,0,0,0,0,0,0,0
784.0,1,0,0,0,0,0,0,0,0,0


---

## Analýza dat

### Pavučinový diagram

#### Alkohol

In [11]:
# Column totals over the alcohol baskets.
flags_alcohol_sum = flags_alcohol.sum()
# Total over every column except ALCOHOL itself.
n_alcohol_connections = flags_alcohol_sum.drop("ALCOHOL").sum()
print(
    flags_alcohol_sum,
    "\n\n",
    n_alcohol_connections,
    "položek bylo v košíku společně s alkoholem",
)

ALCOHOL     310
BAKERY      169
FROZEN      181
MEAT         16
MILK         90
READMADE    167
SNACKS      172
TINNED      136
TOILETRY     47
VEG          36
dtype: int64 

 1014 položek bylo v košíku společně s alkoholem


In [16]:
# Web diagram for alcohol: one node per item type, with an edge from every
# other item type to ALCOHOL.
G_web_alcohol = nx.Graph()

# Graph nodes, laid out on a circle
G_web_alcohol.add_nodes_from(ITEM_TYPES)
nodes_pos = nx.layout.circular_layout(G_web_alcohol)
nodes = hvnx.draw_networkx_nodes(G_web_alcohol, nodes_pos, node_color="blue")

# Node labels, placed at the node position shifted by LABEL_OFFSET along y
LABEL_OFFSET = 0.1
label_pos = {k: [v[0], v[1] + LABEL_OFFSET] for k, v in nodes_pos.items()}
labels = hvnx.draw_networkx_labels(G_web_alcohol, label_pos)

# Edges
edges_data = [(x, "ALCOHOL") for x in ITEM_TYPES if x != "ALCOHOL"]
G_web_alcohol.add_edges_from(edges_data)
# One value per entry of edges_data: the item's total in the alcohol baskets
# as a percentage of n_alcohol_connections.
# NOTE(review): assumes the graph reports its edges in the same order as
# edges_data, so that each value lands on the right edge — confirm.
edge_colors = [flags_alcohol_sum[x[0]] / n_alcohol_connections * 100 for x in edges_data]
edges = hvnx.draw_networkx_edges(G_web_alcohol, nodes_pos, edge_color=edge_colors, colorbar=True)

# Combine the three drawn layers into the cell's displayed result.
nodes * labels * edges

#### Vše

In [31]:
# Inclusive bounds on the co-occurrence count; pairs outside them are not kept.
WEB_RANGE_MIN = 0
WEB_RANGE_MAX = 250

# Every unordered pair of item types.
item_pairs_all = list(itertools.combinations(ITEM_TYPES, 2))
item_pairs_quantities_all = []  # count for every pair, aligned with item_pairs_all
item_pairs = []  # pairs whose count lies within the bounds
item_pairs_quantities = []  # counts of the kept pairs, aligned with item_pairs
total_pairs = 0  # sum of the kept counts
for pair in item_pairs_all:
    first_item, second_item = pair
    # Rows (baskets) in which both flags equal 1.
    both_in_basket = flags[first_item].eq(1) & flags[second_item].eq(1)
    quantity = int(both_in_basket.sum())
    item_pairs_quantities_all.append(quantity)
    if not WEB_RANGE_MIN <= quantity <= WEB_RANGE_MAX:
        continue
    item_pairs.append(pair)
    item_pairs_quantities.append(quantity)
    total_pairs += quantity

# Optional: print every kept item pair together with how many times the two
# items appeared in the same basket
#print(list(zip(item_pairs, item_pairs_quantities)))

# Web diagram for all item types: one node per item type, one edge per kept pair.
G_web_all = nx.Graph()

# Graph nodes, laid out on a circle
G_web_all.add_nodes_from(ITEM_TYPES)
nodes_pos = nx.layout.circular_layout(G_web_all)
nodes = hvnx.draw_networkx_nodes(G_web_all, nodes_pos, node_color="blue")

# Node labels, placed at the node position shifted by LABEL_OFFSET along y
LABEL_OFFSET = 0.1
label_pos = {k: [v[0], v[1] + LABEL_OFFSET] for k, v in nodes_pos.items()}
labels = hvnx.draw_networkx_labels(G_web_all, label_pos)

# Edges
edges_data = item_pairs
G_web_all.add_edges_from(edges_data)
# One value per kept pair: its co-occurrence count.
# NOTE(review): assumes the graph reports its edges in the same order as
# item_pairs, so that each count lands on the right edge — confirm.
edge_colors = item_pairs_quantities
edges = hvnx.draw_networkx_edges(G_web_all, nodes_pos, edge_color=edge_colors, colorbar=True)

# Combine the three drawn layers into the cell's displayed result.
nodes * labels * edges

---

## Modelování

### APRIORI

In [72]:
# NOTE: despite its name, this value is passed to apriori() as `min_support`.
MINIMUM_ANTECEDENT_SUPPORT = 0.1
# Threshold passed to association_rules() for the "confidence" metric.
MINIMUM_RULE_CONFIDENCE = 0.60

# Convert the 0/1 flags to booleans before handing them to apriori().
flags_bool = flags.map(bool)
item_frequencies = apriori(flags_bool, min_support=MINIMUM_ANTECEDENT_SUPPORT, use_colnames=True)
a_rules = (
    association_rules(item_frequencies, metric="confidence", min_threshold=MINIMUM_RULE_CONFIDENCE)
    # The "representativity" column is not used in this notebook. Tolerate its
    # absence so the cell does not fail with KeyError when the result has no
    # such column (presumably the case for some mlxtend releases — confirm).
    .drop(columns=["representativity"], errors="ignore")
    # Strongest rules first; the recommendation code below relies on this order.
    .sort_values(by="confidence", ascending=False)
)

print("Počet pravidel:", len(a_rules.index))
a_rules

Počet pravidel: 57


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
56,"(TINNED, READMADE, ALCOHOL)",(BAKERY),0.120865,0.428753,0.100509,0.831579,1.939528,0.048688,3.391778,0.551008,0.223796,0.705169,0.533
54,"(BAKERY, TINNED, ALCOHOL)",(READMADE),0.12341,0.492366,0.100509,0.814433,1.65412,0.039746,2.735581,0.451122,0.195062,0.634447,0.509284
32,"(MILK, READMADE)",(BAKERY),0.133588,0.428753,0.105598,0.790476,1.843663,0.048322,2.726405,0.528157,0.231198,0.633217,0.518383
34,"(MILK, TINNED)",(BAKERY),0.127226,0.428753,0.100509,0.79,1.842552,0.04596,2.720223,0.523933,0.22067,0.632383,0.512211
52,"(READMADE, SNACKS, ALCOHOL)",(BAKERY),0.13486,0.428753,0.101781,0.754717,1.76026,0.043959,2.328929,0.499228,0.220386,0.570618,0.496053
31,"(BAKERY, MILK)",(READMADE),0.139949,0.492366,0.105598,0.754545,1.532488,0.036692,2.068137,0.404007,0.200483,0.516473,0.484508
45,"(BAKERY, FROZEN, READMADE)",(ALCOHOL),0.138677,0.394402,0.104326,0.752294,1.907428,0.049631,2.444821,0.552329,0.243323,0.590972,0.508405
1,(MILK),(BAKERY),0.188295,0.428753,0.139949,0.743243,1.733499,0.059217,2.224856,0.521288,0.293333,0.550533,0.534826
51,"(BAKERY, SNACKS, ALCOHOL)",(READMADE),0.137405,0.492366,0.101781,0.740741,1.50445,0.034128,1.958015,0.388717,0.192771,0.489279,0.47373
46,"(BAKERY, FROZEN, ALCOHOL)",(READMADE),0.142494,0.492366,0.104326,0.732143,1.486988,0.034167,1.895165,0.381921,0.196643,0.472342,0.472015


### APRIORI – Alkohol

In [77]:
# Keep the rules whose consequent is exactly the single item ALCOHOL.
has_alcohol_consequent = (
    a_rules["consequents"]
    .apply(lambda consequents: consequents == frozenset({"ALCOHOL"}))
    .astype(bool)
)
a_rules_alcohol = a_rules.loc[has_alcohol_consequent]
a_rules_alcohol

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
45,"(BAKERY, FROZEN, READMADE)",(ALCOHOL),0.138677,0.394402,0.104326,0.752294,1.907428,0.049631,2.444821,0.552329,0.243323,0.590972,0.508405
49,"(BAKERY, READMADE, SNACKS)",(ALCOHOL),0.147583,0.394402,0.101781,0.689655,1.74861,0.043574,1.951371,0.502239,0.231214,0.48754,0.47386
53,"(BAKERY, TINNED, READMADE)",(ALCOHOL),0.14631,0.394402,0.100509,0.686957,1.741767,0.042804,1.934549,0.498859,0.228324,0.483084,0.470898
13,"(FROZEN, READMADE)",(ALCOHOL),0.211196,0.394402,0.143766,0.680723,1.725962,0.06047,1.896779,0.533229,0.311295,0.47279,0.52262
16,"(FROZEN, SNACKS)",(ALCOHOL),0.21374,0.394402,0.142494,0.666667,1.690323,0.058194,1.816794,0.519417,0.306011,0.44958,0.513978
5,"(BAKERY, FROZEN)",(ALCOHOL),0.221374,0.394402,0.142494,0.643678,1.632036,0.055183,1.699581,0.497374,0.301075,0.41162,0.502484
0,(MILK),(ALCOHOL),0.188295,0.394402,0.114504,0.608108,1.541848,0.04024,1.545319,0.43295,0.244565,0.352884,0.449215


---

## Nasazení

In [87]:
# Example cart: item type -> whether that item is in the cart.
in_cart_dict = {
    "READMADE": True,
    "SNACKS": False,
    "TOILETRY": False,
    "MILK": False,
    "BAKERY": True,
    "TINNED": False,
    "FROZEN": True,
    "ALCOHOL": False,
    "VEG": False,
    "MEAT": False,
}

# Names of the items flagged True, in dictionary order.
in_cart = [item for item, present in in_cart_dict.items() if present]
in_cart

['READMADE', 'BAKERY', 'FROZEN']

### Doporučit alkohol?

In [90]:
def recommend_alcohol(in_cart: list[str], rules: pd.DataFrame) -> bool:
    """Decide whether any of the given rules applies to the cart.

    A rule applies when every item of its antecedent is in the cart. The cart
    may hold further items as well; requiring an exact match would suppress
    the recommendation as soon as the customer adds anything extra.

    Args:
        in_cart: Names of the items currently in the cart.
        rules: Rules with an "antecedents" column holding collections of item
            names. Only that column is read, so the caller is responsible for
            passing just the rules whose consequent is alcohol.

    Returns:
        True if the antecedent of at least one rule is contained in the cart,
        otherwise False.
    """
    in_cart_set = set(in_cart)
    return any(in_cart_set.issuperset(antecedents) for antecedents in rules["antecedents"])


print(recommend_alcohol(in_cart, a_rules_alcohol))

True


### Obecné doporučení

In [89]:
def recommend_sth(in_cart: list[str], rules: pd.DataFrame, n_recommendations: int) -> list[str]:
    """Recommend up to ``n_recommendations`` items that are not in the cart.

    A rule contributes its consequent items when its antecedent shares at
    least one item with the cart and its consequent shares none. Rules are
    visited in the order given, so pass them already sorted by descending
    confidence to get the strongest recommendations first.

    NOTE(review): matching on a single shared antecedent item is looser than
    requiring the whole antecedent to be in the cart — confirm it is intended.

    Args:
        in_cart: Names of the items currently in the cart.
        rules: Rules with "antecedents" and "consequents" columns holding
            collections of item names.
        n_recommendations: Maximum number of items to return. Zero or a
            negative number yields an empty list.

    Returns:
        Distinct recommended item names in order of first appearance, at most
        ``n_recommendations`` of them.
    """
    # A negative count would otherwise slice from the end of the result.
    if n_recommendations <= 0:
        return []

    cart = set(in_cart)
    result: list[str] = []
    for antecedents, consequents in zip(rules["antecedents"], rules["consequents"]):
        # Skip rules unrelated to the cart and rules that would recommend
        # something the customer already has.
        if cart.isdisjoint(antecedents) or not cart.isdisjoint(consequents):
            continue
        for item in consequents:
            if item not in result:
                result.append(item)
                # Stop as soon as enough items are collected.
                if len(result) == n_recommendations:
                    return result
    return result


print(recommend_sth(in_cart, a_rules, 3))

['ALCOHOL', 'TINNED', 'SNACKS']
