In [16]:
import numpy as np
import pandas as pd
import warnings

from mlxtend.frequent_patterns import apriori

# Silence every FutureWarning for the remainder of the session.
# NOTE(review): this is a blanket filter, so it also hides deprecation notices
# about behaviour that may change in future library releases - confirm that is intended.
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Visual confirmation that the import cell ran to completion.
print("Imports OK ✅")


Imports OK ✅


In [17]:
from scipy.sparse import coo_matrix

# CSV that must provide at least the `order_id` and `product_id` columns.
CLEAN_ORDER_PRODUCTS_PATH = "out/order_products_clean.csv"

# Read only the two key columns and narrow both of them to int32.
op = pd.read_csv(
    CLEAN_ORDER_PRODUCTS_PATH,
    usecols=["order_id", "product_id"],
).astype({"order_id": "int32", "product_id": "int32"})

# Distinct ids; these arrays later label the matrix rows and columns.
order_ids = op["order_id"].unique()
product_ids = op["product_id"].unique()

# Lookup tables: original id -> zero-based matrix position.
order_id_to_row = pd.Series(
    np.arange(len(order_ids), dtype=np.int32),
    index=order_ids,
)
product_id_to_col = pd.Series(
    np.arange(len(product_ids), dtype=np.int32),
    index=product_ids,
)

# Translate every (order_id, product_id) record into (row, col) coordinates.
rows = order_id_to_row.loc[op["order_id"]].to_numpy()
cols = product_id_to_col.loc[op["product_id"]].to_numpy()

# One True flag per record.
data = np.ones(len(op), dtype=np.bool_)

# Assemble in COO form, then convert to CSR.
matrix_shape = (len(order_ids), len(product_ids))
X = coo_matrix((data, (rows, cols)), shape=matrix_shape).tocsr()

print("Rebuilt X ✅", X.shape, "nnz:", X.nnz)


Rebuilt X ✅ (14132, 14304) nnz: 142555


آماده‌سازی برای mlxtend

In [18]:


# One-hot basket table handed to apriori: one row per order (labelled by
# order_ids), one column per product (labelled by product_ids), boolean cells.
#
# The table is densified explicitly with `X.toarray()`. Calling `.astype("bool")`
# on a sparse-backed DataFrame instead is pandas-version dependent: pandas >= 2.0
# returns plain dense bool columns, while older releases keep the columns sparse
# (and raise a FutureWarning). Building the dense frame directly yields the same
# table on every pandas version.
# NOTE(review): mlxtend's sparse-input path is believed to reject integer column
# labels that do not start at 0 - verify before switching this back to a sparse frame.
# NOTE(review): the dense table needs roughly X.shape[0] * X.shape[1] bytes
# (one byte per boolean cell); confirm that fits in memory if the input grows.
basket_onehot = pd.DataFrame(
    X.toarray().astype(bool, copy=False),
    index=order_ids,
    columns=product_ids,
)

# Share of cells that are set; guarded so an empty matrix reports 0.0
# instead of raising ZeroDivisionError.
n_cells = X.shape[0] * X.shape[1]
density = X.nnz / n_cells if n_cells else 0.0

print("Apriori input shape:", basket_onehot.shape)
print("Density:", round(density, 6))


Apriori input shape: (14132, 14304)
Density: 0.000705


Apriori با min_support = 0.05

In [19]:
# Mine frequent itemsets with a minimum support of 0.05.
freq_05 = apriori(basket_onehot, min_support=0.05, use_colnames=True, low_memory=True)

# Add each itemset's length, then order by support (descending) and, within
# equal support, by itemset length (descending); renumber the rows afterwards.
freq_05 = (
    freq_05
    .assign(itemset_size=freq_05["itemsets"].apply(len).astype("int16"))
    .sort_values(["support", "itemset_size"], ascending=[False, False])
    .reset_index(drop=True)
)

# An empty result has no maximum, so report 0 in that case.
max_size_05 = int(freq_05["itemset_size"].max()) if len(freq_05) else 0

print("Frequent itemsets (0.05):", len(freq_05))
print("Max itemset size (0.05):", max_size_05)
freq_05.head(10)


Frequent itemsets (0.05): 6
Max itemset size (0.05): 1


Unnamed: 0,support,itemsets,itemset_size
0,0.151571,(24852),1
1,0.111166,(13176),1
2,0.08739,(21137),1
3,0.07338,(21903),1
4,0.062836,(47209),1
5,0.05109,(47626),1


Apriori با min_support = 0.01

In [20]:
# Mine frequent itemsets with a minimum support of 0.01.
freq_01 = apriori(basket_onehot, min_support=0.01, use_colnames=True, low_memory=True)

# Same post-processing as a single pipeline: record the itemset length, sort by
# support then length (both descending), and reset the row index.
freq_01 = (
    freq_01
    .assign(itemset_size=lambda found: found["itemsets"].apply(len).astype("int16"))
    .sort_values(by=["support", "itemset_size"], ascending=[False, False])
    .reset_index(drop=True)
)

# Fall back to 0 when nothing met the threshold.
max_size_01 = int(freq_01["itemset_size"].max()) if len(freq_01) else 0

print("Frequent itemsets (0.01):", len(freq_01))
print("Max itemset size (0.01):", max_size_01)
freq_01.head(10)


Frequent itemsets (0.01): 122
Max itemset size (0.01): 2


Unnamed: 0,support,itemsets,itemset_size
0,0.151571,(24852),1
1,0.111166,(13176),1
2,0.08739,(21137),1
3,0.07338,(21903),1
4,0.062836,(47209),1
5,0.05109,(47626),1
6,0.047269,(26209),1
7,0.046703,(27845),1
8,0.045853,(16797),1
9,0.044367,(47766),1


جدول مقایسه

In [21]:
# Pair every support threshold with the itemset table it produced.
support_runs = [(0.05, freq_05), (0.01, freq_01)]

# One summary row per threshold; an empty itemset table reports 0 for the
# size statistics rather than NaN.
comparison = pd.DataFrame({
    "min_support": [threshold for threshold, _ in support_runs],
    "num_itemsets": [len(found) for _, found in support_runs],
    "max_itemset_size": [
        int(found["itemset_size"].max()) if len(found) else 0
        for _, found in support_runs
    ],
    "avg_itemset_size": [
        float(found["itemset_size"].mean()) if len(found) else 0
        for _, found in support_runs
    ],
})

# Write the 0.01-support itemsets to disk as a pickle.
freq_01.to_pickle("out/freq_01.pkl")

comparison



Unnamed: 0,min_support,num_itemsets,max_itemset_size,avg_itemset_size
0,0.05,6,1,1.0
1,0.01,122,2,1.114754
