In [3]:
import pandas as pd
import re

# Load the cleaned per-part table and preview its first rows.
parts = pd.read_csv(filepath_or_buffer="../data/cleaned/part_clean.csv")

parts.head(n=5)

Unnamed: 0.1,Unnamed: 0,part_raw,qty,part_clean
0,0,"butelka filtrująca dafi solid 0,7 l szafirowa",1,"butelka filtrująca dafi solid 0,7 l szafirowa"
1,0,filtr węglowy (x1),1,filtr węglowy
2,0,"rurka na filtr do butelki filtrującej dafi solid 0,7 l szafirowym (x1)",1,"rurka na filtr do butelki filtrującej dafi solid 0,7 l szafirowym"
3,0,zestaw 3 filtry do butelki filtrującej dafi soft i solid szafirowy (x1),1,zestaw 3 filtry do butelki filtrującej dafi soft i solid szafirowy
4,1,zestaw 2 filtry,1,zestaw 2 filtry


In [2]:
# Show full cell contents: part descriptions are long and would otherwise be truncated.
pd.options.display.max_colwidth = None


In [13]:
# Category -> list of regex patterns. A description belongs to a category when
# any of its patterns matches; classify() tries the categories in
# CATEGORY_ORDER and returns the first hit, so more specific categories
# (e.g. "filter_set") must be checked before more general ones ("filter").
#
# Polish words are listed both with and without diacritics. The "filter"
# entry is defined once here (it used to be defined in the literal and then
# overwritten) and also covers the genitive plural ("filtrów", "wkładów"),
# matching the word forms the later is_filtr / extract_pack_size steps expect.
RULES = {
    "bottle": [r"\bbutelka\b"],
    "pitcher": [r"\bdzbanek\b"],
    "accessory": [r"\brurka\b", r"\bnakretka\b", r"\bnakrętka\b"],
    "filter_set": [r"\bzestaw\b.*\bfiltr", r"\bzestaw\b.*\bwklad", r"\bzestaw\b.*\bwkład"],
    "filter": [
        r"\bfiltr\b", r"\bfiltry\b", r"\bfiltrow\b", r"\bfiltrów\b",
        r"\bwklad\b", r"\bwkład\b", r"\bwklady\b", r"\bwkłady\b",
        r"\bwkladow\b", r"\bwkładów\b",
    ],
}


In [14]:
import re

# Order in which categories are tried; the first category with a matching
# pattern wins, so "filter_set" must come before the more general "filter".
CATEGORY_ORDER = ["bottle", "pitcher", "accessory", "filter_set", "filter"]


def classify(text, rules=None, order=None):
    """Return the category of a part description.

    Parameters
    ----------
    text : str
        Part description. Anything that is not a string (e.g. the NaN pandas
        produces for an empty CSV field) is classified as "other".
    rules : dict[str, list[str]], optional
        Mapping category -> regex patterns. Defaults to the notebook-level RULES.
    order : list[str], optional
        Categories to try, in priority order. Defaults to CATEGORY_ORDER.

    Returns
    -------
    str
        The first category in `order` with a matching pattern, else "other".
    """
    # re.search raises TypeError on non-strings; treat them as unclassifiable.
    if not isinstance(text, str):
        return "other"

    if rules is None:
        rules = RULES
    if order is None:
        order = CATEGORY_ORDER

    for cat in order:
        if any(re.search(p, text) for p in rules[cat]):
            return cat
    return "other"


In [20]:
# Label every part line with its category.
parts["category"] = parts["part_clean"].map(classify)


In [21]:
# How many part lines fall into each category (most frequent first).
parts.loc[:, "category"].value_counts()


category
filter        11504
bottle         8749
filter_set     7587
other          6646
pitcher        4959
accessory       984
Name: count, dtype: int64

In [31]:
def extract_pack_size(text):
    """Return the number of filters/inserts named in a part description.

    Every "<number> <filtr.../wkład...>" occurrence is collected and the
    numbers are summed (e.g. "2 filtry + 1 wkład" -> 3). If there is none,
    a number that directly follows the word "zestaw" is used instead.

    Returns NaN (a float) when no pack size can be read from the text,
    including when `text` is not a string.
    """
    # Built-in NaN rather than np.nan: numpy is only imported in a later cell,
    # so np.nan would raise NameError here on a fresh top-to-bottom run.
    missing = float("nan")

    # re.findall raises TypeError on non-strings (e.g. NaN from an empty CSV field).
    if not isinstance(text, str):
        return missing

    # looks for "3 filtry", "8 filtrów", "2 wkłady", etc.
    m = re.findall(r"\b(\d+)\b\s*(filtr|filtry|filtrow|filtrów|wklad|wkład|wklady|wkłady)", text)
    if m:
        return sum(int(n) for n, _ in m)

    # fallback: "zestaw 3 ..." (the number is not directly followed by 'filtry')
    m2 = re.search(r"\bzestaw\b\s*(\d+)\b", text)
    if m2:
        return int(m2.group(1))

    return missing


In [37]:
# Pack size per line, as read from the description text.
parts["pack_size"] = parts["part_clean"].apply(extract_pack_size)

# Every row starts at zero; only filter-type rows are filled in below.
parts["filter_units"] = 0

# Filter sets: ordered quantity times pack size (1 when no size was found).
mask_set = parts["category"] == "filter_set"
parts.loc[mask_set, "filter_units"] = parts.loc[mask_set, "qty"] * parts.loc[mask_set, "pack_size"].fillna(1)

# Single filters: same formula.
mask_filter = parts["category"] == "filter"
parts.loc[mask_filter, "filter_units"] = parts.loc[mask_filter, "qty"] * parts.loc[mask_filter, "pack_size"].fillna(1)


In [38]:
import numpy as np
import re

def has_any(text, patterns):
    """Return True if at least one regex in `patterns` matches somewhere in `text`."""
    for pattern in patterns:
        if re.search(pattern, text):
            return True
    return False

# Whole-word forms of "wkład" and "filtr", with and without diacritics.
# The genitive plural of "wkład" ("wkładów"/"wkladow") is included so that
# lines such as "zestaw 6 wkładów" are flagged, mirroring the forms already
# listed for "filtr".
wklad_pattern = r"\b(?:wkład|wkłady|wkładów|wklad|wklady|wkladow)\b"
filtr_pattern = r"\b(?:filtr|filtry|filtrów|filtrow)\b"

# na=False: a missing description counts as "no match" instead of yielding NaN.
parts["is_wklad"] = parts["part_clean"].str.contains(wklad_pattern, regex=True, na=False)
parts["is_filtr"] = parts["part_clean"].str.contains(filtr_pattern, regex=True, na=False)

# Identify destination when it is explicitly written ("do dzban..." / "do butelk...").
# na=False matters here: np.where would otherwise treat NaN as truthy.
parts["dest"] = np.where(
    parts["part_clean"].str.contains(r"\bdo dzban", regex=True, na=False),
    "pitcher",
    np.where(
        parts["part_clean"].str.contains(r"\bdo butelk", regex=True, na=False),
        "bottle",
        "unknown",
    ),
)

# Pack size read from the description text
parts["pack_size"] = parts["part_clean"].apply(extract_pack_size)

# Base units from text: if number exists use it, otherwise 1
parts["units"] = parts["pack_size"].fillna(1) * parts["qty"]


In [39]:
# Both counters start at zero for every line.
parts["bottle_filter_units"] = 0
parts["pitcher_insert_units"] = 0

# Only lines classified as a filter or a filter set contribute consumable units.
is_consumable = parts["category"].isin(["filter", "filter_set"])

# "wkład" wording => pitcher inserts; "filtr" wording without "wkład" => bottle filters.
# NOTE(review): the `dest` column is not consulted here, so a "filtr ... do dzbanka"
# line is counted as a bottle filter -- confirm that this is intended.
mask_wklad = parts["is_wklad"] & is_consumable
mask_filtr = parts["is_filtr"] & is_consumable & ~parts["is_wklad"]

parts.loc[mask_wklad, "pitcher_insert_units"] = parts["units"][mask_wklad]
parts.loc[mask_filtr, "bottle_filter_units"] = parts["units"][mask_filtr]


In [40]:
# One included filter is counted per bottle ordered; every other line gets 0.
bottle_rows = parts["category"] == "bottle"
parts["bottle_included_filter_units"] = 0
parts.loc[bottle_rows, "bottle_included_filter_units"] = parts["qty"][bottle_rows]


In [41]:
# Spot-check: first 40 lines that mention a filter/insert or are a bottle/pitcher.
preview_cols = ["part_clean","category","qty","dest","bottle_filter_units","pitcher_insert_units","bottle_included_filter_units"]
is_relevant = parts["category"].isin(["bottle","pitcher"]) | parts["is_wklad"] | parts["is_filtr"]
parts.loc[is_relevant, preview_cols].head(40)


Unnamed: 0,part_clean,category,qty,dest,bottle_filter_units,pitcher_insert_units,bottle_included_filter_units
0,"butelka filtrująca dafi solid 0,7 l szafirowa",bottle,1,unknown,0,0,1
1,filtr węglowy,filter,1,unknown,1,0,0
2,"rurka na filtr do butelki filtrującej dafi solid 0,7 l szafirowym",accessory,1,bottle,0,0,0
3,zestaw 3 filtry do butelki filtrującej dafi soft i solid szafirowy,filter_set,1,bottle,3,0,0
4,zestaw 2 filtry,filter_set,1,unknown,2,0,0
6,zestaw 3 filtry do butelki filtrującej dafi soft i solid cytrynowy,filter_set,1,bottle,3,0,0
7,"butelka filtrująca dafi soft 0,3 l stalowa",bottle,1,unknown,0,0,1
8,filtr węglowy,filter,1,unknown,1,0,0
9,"butelka filtrująca dafi soft 0,5 l jagodowa",bottle,1,unknown,0,0,1
10,filtr węglowy,filter,1,unknown,1,0,0


In [42]:
# Category -> name of the column that holds the ordered quantity for that category.
unit_columns = {
    "bottle": "bottle_units",
    "pitcher": "pitcher_units",
    "accessory": "accessory_units",
    "other": "other_units",
}

# Create all counters first (keeps the column order), then copy `qty` into the
# counter that matches each line's category.
for unit_col in unit_columns.values():
    parts[unit_col] = 0

for category_name, unit_col in unit_columns.items():
    in_category = parts["category"] == category_name
    parts.loc[in_category, unit_col] = parts.loc[in_category, "qty"]


In [45]:
# Persist the decoded table. index=False keeps the row index out of the file;
# writing it produces an extra "Unnamed: 0" column every time the CSV is read
# back (visible in the preview of part_clean.csv at the top of this notebook).
parts.to_csv("../data/cleaned/decoded.csv", index=False)

In [46]:
# columns that identify an order (adjust if your order id column name differs)
ORDER_COLS = ["kolejności", "Anon", "Data zakupu"]

parts = (
    df[ORDER_COLS + ["product_parts"]]
      .explode("product_parts")
      .rename(columns={"product_parts": "part_raw"})
      .copy()
)

parts["part_raw"] = parts["part_raw"].fillna("")
parts["part_clean"] = parts["part_raw"].apply(strip_qty)
parts["qty"] = parts["part_raw"].apply(get_qty)

# re-classify
parts["category"] = parts["part_clean"].apply(classify)

# recompute pack_size + units using your improved extractor
parts["pack_size"] = parts["part_clean"].apply(extract_pack_size)
parts["units"] = parts["pack_size"].fillna(1) * parts["qty"]


NameError: name 'df' is not defined