Import + ورودی‌ها

In [9]:
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

CLEAN_ORDER_PRODUCTS_PATH = "out/order_products_clean.csv"

# Read only the two id columns and narrow both to int32 in one chained step.
order_products_clean = pd.read_csv(
    CLEAN_ORDER_PRODUCTS_PATH,
    usecols=["order_id", "product_id"],
).astype({"order_id": "int32", "product_id": "int32"})

# Size summary: total rows plus the number of distinct ids in each column.
print("Rows:", len(order_products_clean))
print("Unique orders:", order_products_clean["order_id"].nunique())
print("Unique products:", order_products_clean["product_id"].nunique())


Rows: 142555
Unique orders: 14132
Unique products: 14304


ساخت ماتریس One-Hot به صورت Sparse (CSR)

In [10]:

# Distinct ids (pandas returns them in order of appearance); they label the
# two axes of the matrix built below.
order_ids = order_products_clean["order_id"].unique()
product_ids = order_products_clean["product_id"].unique()

# Lookup tables: raw id -> zero-based position along the matching axis.
order_id_to_row = pd.Series(np.arange(order_ids.size, dtype=np.int32), index=order_ids)
product_id_to_col = pd.Series(np.arange(product_ids.size, dtype=np.int32), index=product_ids)

# Translate every (order_id, product_id) record into a (row, col) coordinate
# using explicit label-based lookup.
rows = order_id_to_row.loc[order_products_clean["order_id"]].to_numpy()
cols = product_id_to_col.loc[order_products_clean["product_id"]].to_numpy()

# One boolean True per record.
data = np.ones(rows.shape[0], dtype=bool)

# Assemble in COO form from the coordinate triplets, then convert to CSR.
X = coo_matrix(
    (data, (rows, cols)),
    shape=(order_ids.size, product_ids.size),
).tocsr()

print("One-hot shape (orders x products):", X.shape)
print("Non-zeros (total 1s):", X.nnz)


One-hot shape (orders x products): (14132, 14304)
Non-zeros (total 1s): 142555


تبدیل به pandas DataFrame با ستون‌های Sparse برای mlxtend

In [11]:

import warnings

# NOTE(review): this blanket FutureWarning suppression is kept unchanged because
# the filter is process-wide and later cells may rely on the quieter output.
# Presumably it targets pandas' sparse-dtype deprecation messages -- confirm
# and narrow it to the specific message/module if possible.
warnings.filterwarnings("ignore", category=FutureWarning)

# Wrap the CSR matrix in a DataFrame with sparse columns, labelled with the
# original order ids (rows) and product ids (columns).
basket_onehot = pd.DataFrame.sparse.from_spmatrix(
    X,
    index=order_ids,
    columns=product_ids,
)

# Request a *sparse* boolean dtype explicitly. A plain astype("bool") is
# version-dependent: since pandas 2.0 it casts to a fully dense bool frame
# (one byte per order x product cell), while older releases kept the data
# sparse and only emitted a FutureWarning (silenced by the filter above).
# NOTE(review): some mlxtend releases reject sparse frames whose integer column
# labels do not start at 0 -- if the downstream call complains, cast the column
# labels to str at that point rather than densifying here.
basket_onehot = basket_onehot.astype(pd.SparseDtype(bool, fill_value=False))

print("basket_onehot shape:", basket_onehot.shape)
basket_onehot.iloc[:5, :5]


basket_onehot shape: (14132, 14304)


Unnamed: 0,17889,9292,15424,18988,38959
64,True,True,True,True,True
176,False,False,False,False,False
178,False,False,False,False,False
504,False,False,False,False,False
506,False,False,False,False,False


Sanity Checks

In [12]:

# Fraction of matrix cells that hold a stored entry.
n_orders, n_products = X.shape
density = X.nnz / (n_orders * n_products)
print("Density:", round(density, 6))

# Per-row totals: the axis-1 sum comes back two-dimensional, so flatten it to
# a 1-D array before taking summary statistics.
row_sums = np.asarray(X.sum(axis=1)).reshape(-1)
print("Row sum min:", int(np.min(row_sums)))
print("Row sum max:", int(np.max(row_sums)))
print("Row sum mean:", float(np.mean(row_sums)))


Density: 0.000705
Row sum min: 2
Row sum max: 74
Row sum mean: 10.087390319841495
