Load cleaned files + ساخت basket ها

In [5]:
import pandas as pd
import gc

# Locations of the cleaned inputs and of the product catalogue.
PRODUCTS_PATH = "products.csv"
CLEAN_ORDERS_PATH = "out/orders_clean.csv"
CLEAN_ORDER_PRODUCTS_PATH = "out/order_products_clean.csv"

# Read the cleaned orders table from disk.
orders_clean = pd.read_csv(CLEAN_ORDERS_PATH)

# Read the cleaned order/product pairs and cast both id columns to int32 in a
# single step (smaller memory footprint, faster groupby).
order_products_clean = pd.read_csv(CLEAN_ORDER_PRODUCTS_PATH).astype(
    {"order_id": "int32", "product_id": "int32"}
)

# One basket per order: a Series indexed by order_id whose values are the
# lists of product_ids belonging to that order.
baskets = order_products_clean.groupby("order_id")["product_id"].apply(list)

print("Number of baskets:", len(baskets))
print("Example basket:", baskets.iloc[0][:20])


Number of baskets: 14132
Example basket: [17889, 9292, 15424, 18988, 38959]


ساخت جدول transactions

In [6]:
# Flatten the baskets Series into a frame with columns order_id, items and
# basket_size (number of products in the basket, stored as int16).
transactions_df = (
    baskets
    .rename_axis("order_id")
    .rename("items")
    .reset_index()
    .assign(basket_size=lambda frame: frame["items"].map(len).astype("int16"))
)

print("transactions_df shape:", transactions_df.shape)
print(
    "basket_size min/max/mean:",
    transactions_df["basket_size"].min(),
    transactions_df["basket_size"].max(),
    round(transactions_df["basket_size"].mean(), 2),
)

transactions_df.head()


transactions_df shape: (14132, 3)
basket_size min/max/mean: 2 74 10.09


Unnamed: 0,order_id,items,basket_size
0,64,"[17889, 9292, 15424, 18988, 38959]",5
1,176,"[5876, 26497, 17872, 4675, 21267, 22825, 41588...",10
2,178,"[49235, 6369]",2
3,504,"[19894, 2237, 6628, 49533, 5025, 15392, 19678,...",20
4,506,"[32768, 24852, 47626, 5818, 35042, 38379, 3150...",13


اضافه کردن اسم کالا برای خوانایی گزارش

In [7]:
# Product catalogue lookup: product_id -> product_name. Only those two columns
# are read from the products file.
products = pd.read_csv(PRODUCTS_PATH, usecols=["product_id", "product_name"])
pid_to_name = products.set_index("product_id")["product_name"].to_dict()

def map_names(item_list, mapping=None):
    """Translate the product ids of one basket into product names.

    Parameters
    ----------
    item_list : iterable
        Product ids of a single basket.
    mapping : dict, optional
        Lookup from product id to product name. When omitted, the
        notebook-level ``pid_to_name`` dict is used, so existing
        one-argument calls (e.g. via ``Series.apply``) behave as before.

    Returns
    -------
    list
        One entry per id, in input order. An id that is missing from the
        lookup is replaced by its string form instead of raising.
    """
    # `is None` (not truthiness) so that an explicitly passed empty dict is honoured.
    lookup = pid_to_name if mapping is None else mapping
    return [lookup.get(pid, str(pid)) for pid in item_list]

# Add a readable counterpart of `items`, with product names in place of ids.
transactions_df["item_names"] = transactions_df["items"].map(map_names)

print("Example named basket:", transactions_df.at[0, "item_names"][:15])


Example named basket: ['Caramel Vanilla Cream Light Roast K-Cup Packs Arabica Coffee', 'Half And Half Ultra Pasteurized', 'Purified Water', 'Ultra Soft & Strong® Toilet Paper Double Rolls', 'Smoothies, Strawberries Wild']


ذخیره خروجی Task 2

In [8]:
# Persist the transactions table. The list-valued columns (items, item_names)
# are written as the text form of each Python list, so a later reader has to
# parse them back (e.g. with ast.literal_eval) after pd.read_csv.
OUT_TRANSACTIONS_PATH = "out/transactions.csv"
transactions_df.to_csv(OUT_TRANSACTIONS_PATH, index=False)
print("Saved:", OUT_TRANSACTIONS_PATH)

# Free memory. The guard keeps this cell re-runnable: a bare `del` raises
# NameError once the frame has already been dropped by a previous run.
if "order_products_clean" in globals():
    del order_products_clean
# The trailing semicolon stops the notebook from echoing gc.collect()'s return value.
gc.collect();


Saved: out/transactions.csv


82