# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools tqdm`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety

from more_itertools import powerset
from tqdm import tqdm

In [2]:
# Constants shared by the cells below.

# Location of the CSV file that holds one basket per row (after a header row).
PATH = './basket.csv'
# Support threshold: itemsets whose support does not exceed it are pruned,
# and this value is also what lookups fall back to for unknown itemsets.
EPSILON = 0.001

In [3]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[set[str]]:
    """
    Load shopping baskets from a comma-separated text file.

    The first line is treated as a header and skipped. Every following
    non-blank line becomes one basket: its comma-separated fields are trimmed,
    lower-cased and collected into a set, with empty fields (e.g. the padding
    produced by trailing commas) dropped.

    Args:
        path (str): Path to the CSV file.

    Returns:
        list[set[str]]: One set of normalized product names per data row, in file order.
            A row made only of separators yields an empty set.
    """
    # Explicit encoding so the result does not depend on the platform's locale default.
    with open(path, encoding='utf-8') as f:
        lines = f.read().split('\n')

    parsed = []
    for line in lines[1:]:  # lines[0] is the header row
        if not line.strip():
            # Blank (or whitespace-only) lines are not baskets.
            continue
        # Trim before comparing so ' pastry' and 'pastry' are the same product.
        names = (field.strip().lower() for field in line.split(','))
        parsed.append({name for name in names if name})

    return parsed


def unique_products(baskets: list[set[str]]) -> list[str]:
    """
    Collect every distinct product that appears in at least one basket.

    Args:
        baskets (list[set[str]]): The baskets to scan.

    Returns:
        list[str]: The distinct product names in ascending (lexicographic) order.
    """
    distinct = {product for basket in baskets for product in basket}
    ordered = list(distinct)
    ordered.sort()
    return ordered


# Load all baskets from the CSV file and derive the sorted list of distinct products.
baskets = read_baskets(PATH)
products = unique_products(baskets)

In [4]:
# Preview the first ten product names (the list is sorted by unique_products).
products[:10]

['abrasive cleaner',
 'artif. sweetener',
 'baby cosmetics',
 'bags',
 'baking powder',
 'bathroom cleaner',
 'beef',
 'berries',
 'beverages',
 'bottled beer']

In [5]:
# Preview the first ten baskets in file order.
baskets[:10]

[{'pastry', 'salty snack', 'whole milk'},
 {'sausage', 'semi-finished bread', 'whole milk', 'yogurt'},
 {'pickled vegetables', 'soda'},
 {'canned beer', 'misc. beverages'},
 {'hygiene articles', 'sausage'},
 {'rolls/buns', 'sausage', 'whole milk'},
 {'soda', 'whole milk'},
 {'frankfurter', 'soda', 'whipped/sour cream'},
 {'curd', 'frankfurter'},
 {'beef', 'white bread'}]

## Część 2. - obliczanie wskaźników

In [6]:
def calculate_support(
        basket: set[str],
        all_baskets: list[set[str]],
        all_baskets_count: int,
) -> tuple[float, list[set[str]]]:
    """
    Measure how often an itemset occurs among the given baskets.

    Args:
        basket (set[str]): The itemset whose occurrences are counted.
        all_baskets (list[set[str]]): The baskets to search.
        all_baskets_count (int): The denominator of the support fraction. It is supplied
            by the caller and may differ from ``len(all_baskets)``.

    Returns:
        tuple[float, list[set[str]]]: The support (matching baskets divided by
            ``all_baskets_count``) and the matching baskets themselves, in their original order.
    """
    needed = len(basket)
    matching = []

    for candidate in all_baskets:
        # A basket smaller than the itemset cannot contain it.
        if len(candidate) < needed:
            continue
        if basket.issubset(candidate):
            matching.append(candidate)

    return len(matching) / all_baskets_count, matching

In [7]:
def get_supports_recursive(supports: dict, all_baskets: list[set[str]], all_products: list[str], epsilon: float,
                           basket: list[str], all_baskets_count: int) -> None:
    """
    Record the support of ``basket`` and, if it is frequent enough, of its extensions.

    The itemset is stored in ``supports`` under a ``frozenset`` key. When its support
    exceeds ``epsilon`` the measured value is stored and the itemset is extended,
    depth-first, with each later product of ``all_products``. Otherwise ``epsilon``
    itself is stored and the branch is abandoned.

    Args:
        supports (dict): Output mapping from ``frozenset`` itemsets to support values; mutated in place.
        all_baskets (list[set[str]]): Baskets to count occurrences in. Recursive calls
            receive only the baskets that already contain the current itemset.
        all_products (list[str]): Ordered list of all products; must contain ``basket[-1]``.
        epsilon (float): Support threshold used for pruning.
        basket (list[str]): The current itemset, as a non-empty list of products.
        all_baskets_count (int): The denominator used for every support value.

    Returns:
        None
    """
    value, containing = calculate_support(set(basket), all_baskets, all_baskets_count)
    key = frozenset(basket)

    frequent = value > epsilon
    supports[key] = value if frequent else epsilon
    if not frequent:
        # No extension of an infrequent itemset is explored.
        return

    # Extend only with products at or after the last one added, so each
    # combination is generated in a single order.
    start = all_products.index(basket[-1])
    for candidate in all_products[start:]:
        if candidate in basket:
            continue
        get_supports_recursive(supports, containing, all_products, epsilon, [*basket, candidate],
                               all_baskets_count)

In [8]:
# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`
def get_supports(all_baskets: list[set[str]], all_products: list[str], epsilon: float) -> dict:
    """
    Build the table of support values, starting one search from every product.

    Args:
        all_baskets (list[set[str]]): All baskets in the dataset.
        all_products (list[str]): All products; each one seeds a recursive search.
        epsilon (float): Support threshold forwarded to the recursive search.

    Returns:
        dict: The mapping filled in by ``get_supports_recursive``, keyed by ``frozenset`` itemsets.
    """
    total = len(all_baskets)
    table = {}

    # tqdm only wraps the iteration to show a progress bar.
    progress = tqdm(all_products, desc='Creating baskets')
    for seed in progress:
        get_supports_recursive(table, all_baskets, all_products, epsilon, [seed], total)

    return table


# Build the support table for the whole dataset, then display it as the cell result.
supports = get_supports(baskets, products, EPSILON)
supports

Creating baskets: 100%|██████████| 167/167 [00:00<00:00, 281.80it/s]


{frozenset({'abrasive cleaner'}): 0.0014702933903628951,
 frozenset({'abrasive cleaner', 'artif. sweetener'}): 0.001,
 frozenset({'abrasive cleaner', 'baby cosmetics'}): 0.001,
 frozenset({'abrasive cleaner', 'bags'}): 0.001,
 frozenset({'abrasive cleaner', 'baking powder'}): 0.001,
 frozenset({'abrasive cleaner', 'bathroom cleaner'}): 0.001,
 frozenset({'abrasive cleaner', 'beef'}): 0.001,
 frozenset({'abrasive cleaner', 'berries'}): 0.001,
 frozenset({'abrasive cleaner', 'beverages'}): 0.001,
 frozenset({'abrasive cleaner', 'bottled beer'}): 0.001,
 frozenset({'abrasive cleaner', 'bottled water'}): 0.001,
 frozenset({'abrasive cleaner', 'brandy'}): 0.001,
 frozenset({'abrasive cleaner', 'brown bread'}): 0.001,
 frozenset({'abrasive cleaner', 'butter'}): 0.001,
 frozenset({'abrasive cleaner', 'butter milk'}): 0.001,
 frozenset({'abrasive cleaner', 'cake bar'}): 0.001,
 frozenset({'abrasive cleaner', 'candles'}): 0.001,
 frozenset({'abrasive cleaner', 'candy'}): 0.001,
 frozenset({'abr

In [9]:
# definiujemy funkcje obliczajace support, confidence i lift
def support(supports: dict[frozenset[str], float], products: set[str]) -> float:
    """
    Look up the support of an itemset in a precomputed table.

    Args:
        supports (dict[frozenset[str], float]): Table of support values keyed by itemset.
        products (set[str]): The itemset to look up.

    Returns:
        float: The stored support, or the module-level ``EPSILON`` when the itemset is not in the table.
    """
    key = frozenset(products)
    if key in supports:
        return supports[key]
    return EPSILON


def confidence(supports: dict[frozenset[str], float], prior_products: set[str], following_products: set[str]) -> float:
    """
    Compute the confidence of the rule ``prior_products -> following_products``.

    Args:
        supports (dict[frozenset[str], float]): Table of support values keyed by itemset.
        prior_products (set[str]): The rule's antecedent itemset.
        following_products (set[str]): The rule's consequent itemset.

    Returns:
        float: Support of the union of both itemsets divided by the support of the antecedent.
    """
    joint = support(supports, prior_products | following_products)
    antecedent = support(supports, prior_products)
    return joint / antecedent


def lift(supports: dict[frozenset[str], float], prior_products: set[str], following_products: set[str]) -> float:
    """
    Compute the lift of the rule ``prior_products -> following_products``.

    Args:
        supports (dict[frozenset[str], float]): Table of support values keyed by itemset.
        prior_products (set[str]): The rule's antecedent itemset.
        following_products (set[str]): The rule's consequent itemset.

    Returns:
        float: Support of the union of both itemsets divided by the product of
            the two itemsets' individual supports.
    """
    joint = support(supports, prior_products | following_products)
    antecedent = support(supports, prior_products)
    consequent = support(supports, following_products)
    return joint / (antecedent * consequent)

In [10]:
# Sanity check: print the support of {whole milk, rolls/buns} and the
# confidence and lift of the rule {whole milk, rolls/buns} -> {yogurt}.
print(f"Support: {support(supports, {'whole milk', 'rolls/buns'})}")
print(f"Confidence: {confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'})}")
print(f"Lift: {lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'})}")

Support: 0.013967787208447505
Confidence: 0.09569377990430622
Lift: 1.1142926293448512


## Część 3. - generowanie rekomendacji

In [11]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i mozliwie wysokie confidence

def generate_next_product_candidates(basket: set[str], products: list[str], supports: dict) -> list[
    tuple[str, set[str], float, float]]:
    """
    Propose products to add to a basket, based on association rules.

    For every non-empty subset of ``basket`` and every product not already in the
    basket, the rule ``subset -> {product}`` is scored. Rules with lift above 1 are kept.

    Args:
        basket (set[str]): The products currently in the basket.
        products (list[str]): All available products.
        supports (dict): Table of support values keyed by ``frozenset`` itemsets.

    Returns:
        list[tuple[str, set[str], float, float]]: One tuple per kept rule, holding the candidate
            product, the basket subset used as antecedent, the confidence and the lift. The list
            is sorted by confidence in ascending order, so the strongest rules come last.

    NOTE(review): itemsets pruned while building ``supports`` are stored with the threshold
    value rather than their measured support, and ``support()`` falls back to ``EPSILON`` for
    unknown itemsets. Rules involving rare products can therefore show lift > 1 without real
    evidence -- confirm whether such candidates should be filtered out.
    """
    # Products already in the basket are never recommended; filter once, not per subset.
    candidates = [item for item in products if item not in basket]
    recommendation = []

    # The number of subsets grows as 2 ** len(basket).
    for subset in powerset(basket):
        if not subset:
            # The empty antecedent is not a rule.
            continue

        sub_basket = set(subset)
        for item in candidates:
            lift_value = lift(supports, sub_basket, {item})
            if lift_value <= 1:
                continue
            # Confidence is only needed for rules that pass the lift filter.
            confidence_value = confidence(supports, sub_basket, {item})
            recommendation.append((item, sub_basket, confidence_value, lift_value))

    return sorted(recommendation, key=lambda x: x[2])

In [12]:
# Show the second basket and the candidate products generated for it.
print(baskets[1])
generate_next_product_candidates(baskets[1], products, supports)

{'whole milk', 'semi-finished bread', 'sausage', 'yogurt'}


[('abrasive cleaner', {'whole milk'}, 0.006332204824375794, 4.306762763051592),
 ('artif. sweetener', {'whole milk'}, 0.006332204824375794, 3.267199337487414),
 ('baby cosmetics', {'whole milk'}, 0.006332204824375794, 6.332204824375794),
 ('bags', {'whole milk'}, 0.006332204824375794, 6.332204824375794),
 ('bathroom cleaner', {'whole milk'}, 0.006332204824375794, 5.573457693360883),
 ('brandy', {'whole milk'}, 0.006332204824375794, 2.4933889680825003),
 ('cake bar', {'whole milk'}, 0.006332204824375794, 1.0298780520340762),
 ('candles', {'whole milk'}, 0.006332204824375794, 1.4355875876838637),
 ('canned fruit', {'whole milk'}, 0.006332204824375794, 4.511846704149287),
 ('canned vegetables',
  {'whole milk'},
  0.006332204824375794,
  1.1554729364284757),
 ('cereals', {'whole milk'}, 0.006332204824375794, 2.2559233520746433),
 ('chocolate marshmallow',
  {'whole milk'},
  0.006332204824375794,
  1.57914634645225),
 ('cleaner', {'whole milk'}, 0.006332204824375794, 3.056412283455968),
 

In [13]:
# Show the basket at index 33 and the candidate products generated for it.
print(baskets[33])
generate_next_product_candidates(baskets[33], products, supports)

{'yogurt', 'tropical fruit', 'domestic eggs', 'root vegetables', 'soda', 'white wine', 'photo/film'}


[('abrasive cleaner', {'soda'}, 0.010298004129387476, 7.004047081273855),
 ('artif. sweetener', {'soda'}, 0.010298004129387476, 5.313415027173268),
 ('baby cosmetics', {'soda'}, 0.010298004129387476, 10.298004129387476),
 ('bags', {'soda'}, 0.010298004129387476, 10.298004129387476),
 ('baking powder', {'soda'}, 0.010298004129387476, 1.2734631056861552),
 ('bathroom cleaner', {'soda'}, 0.010298004129387476, 9.064060928707342),
 ('brandy', {'soda'}, 0.010298004129387476, 4.054974626000653),
 ('cake bar', {'soda'}, 0.010298004129387476, 1.6748808237828778),
 ('candles', {'soda'}, 0.010298004129387476, 2.334682360424618),
 ('canned fish', {'soda'}, 0.010298004129387476, 1.3399046590263024),
 ('canned fruit', {'soda'}, 0.010298004129387476, 7.337573132763087),
 ('canned vegetables', {'soda'}, 0.010298004129387476, 1.8791345827807904),
 ('cereals', {'soda'}, 0.010298004129387476, 3.6687865663815433),
 ('chocolate marshmallow', {'soda'}, 0.010298004129387476, 2.56815059646708),
 ('cleaner', {