In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth

import networkx as nx

Загрузка данных

In [None]:
# Location of the raw basket file, kept in one place so it is easy to repoint.
DATA_PATH = '/mnt/data/Market_Basket_Optimisation.csv'

# header=None: the first line of the file is treated as data, not column names.
df = pd.read_csv(DATA_PATH, header=None)
# A bare `df.head()` in the middle of a cell is evaluated and thrown away;
# only the last expression of a cell is rendered. `display` is the builtin
# that the IPython/Jupyter kernel injects, so the preview is actually shown.
display(df.head())

# Turn the frame into a list of transactions (one list of item names per row).
# Only str cells are kept, which drops non-string padding (presumably the NaN
# values pandas uses for rows shorter than the widest one).
# Names are stripped so that whitespace variants of the same product
# (e.g. ' x' vs 'x') collapse into one item; cells that are empty after
# stripping are dropped.
transactions = [
    [item.strip() for item in row if isinstance(item, str) and item.strip()]
    for row in df.values.tolist()
]

print("Пример транзакции:", transactions[0])
print("Всего транзакций:", len(transactions))

Описание данных

In [None]:
# Number of items in every transaction.
transaction_lengths = list(map(len, transactions))

# One histogram bin per integer length: edges run from 1 to max length + 1.
length_bins = range(1, max(transaction_lengths) + 2)

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(transaction_lengths, bins=length_bins)
ax.set(
    title="Распределение длин транзакций",
    xlabel="Число товаров",
    ylabel="Частота",
)
plt.show()

# Distinct item names across all transactions, sorted.
unique_items = sorted(set().union(*transactions))
n_unique_items = len(unique_items)
print("Количество уникальных товаров:", n_unique_items)
print("Первые 20 товаров:", unique_items[:20])

One-Hot Encoding

In [None]:
# Fit the encoder on the transactions first, then encode the same transactions.
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)

# Wrap the encoded array in a frame whose columns are the encoder's `columns_`.
data = pd.DataFrame(data=te_ary, columns=te.columns_)

print("Размер бинарной матрицы:", data.shape)
data.head()

Apriori

In [None]:
# Support threshold passed to Apriori.
min_support_ap = 0.02

# Frequent itemsets with item names (use_colnames=True) instead of column indices.
frequent_ap = apriori(data, min_support=min_support_ap, use_colnames=True)
# Size of each itemset, handy for filtering and grouping by length.
frequent_ap['length'] = frequent_ap['itemsets'].apply(len)

print("Частые наборы (Apriori):", len(frequent_ap))
# A bare `frequent_ap.head()` here would be evaluated and discarded, because
# only the last expression of a cell is rendered. `display` (a builtin injected
# by the IPython/Jupyter kernel) shows the preview explicitly.
display(frequent_ap.head())

# Association rules filtered on the confidence metric with threshold 0.3.
rules_ap = association_rules(frequent_ap, metric='confidence', min_threshold=0.3)
rules_ap.head()

Анализ лучших правил Apriori

In [None]:
# Human-readable labels for both sides of each rule.
# The rule sides are unordered item collections (presumably frozensets, as is
# the mlxtend convention), so joining them in raw iteration order can yield a
# different string after every kernel restart. Sorting the items first makes
# the labels stable and reproducible.
rules_ap['antecedents_str'] = rules_ap['antecedents'].apply(lambda items: ', '.join(sorted(items)))
rules_ap['consequents_str'] = rules_ap['consequents'].apply(lambda items: ', '.join(sorted(items)))

print("ТОП-10 правил по лифту:")
# Last expression of the cell: the ten rules with the highest lift.
rules_ap.sort_values(by='lift', ascending=False).head(10)

FP-Growth

In [None]:
# Support threshold passed to FP-Growth.
min_support_fpg = 0.02

# Frequent itemsets with item names (use_colnames=True) instead of column indices.
frequent_fpg = fpgrowth(data, min_support=min_support_fpg, use_colnames=True)
# Size of each itemset.
frequent_fpg['length'] = frequent_fpg['itemsets'].apply(len)

print("Частые наборы (FPGrowth):", len(frequent_fpg))

# Association rules filtered on the confidence metric with threshold 0.3.
rules_fpg = association_rules(frequent_fpg, metric='confidence', min_threshold=0.3)

# Human-readable labels for both sides of each rule. These strings are later
# used as node names in the rule graph, so they must be reproducible.
# The rule sides are unordered item collections (presumably frozensets), so the
# items are sorted before joining instead of relying on raw iteration order.
rules_fpg['antecedents_str'] = rules_fpg['antecedents'].apply(lambda items: ', '.join(sorted(items)))
rules_fpg['consequents_str'] = rules_fpg['consequents'].apply(lambda items: ', '.join(sorted(items)))

print("ТОП-10 правил по лифту (FPGrowth):")
# Last expression of the cell: the ten rules with the highest lift.
rules_fpg.sort_values(by='lift', ascending=False).head(10)

Минимальная поддержка наборов разной длины

In [None]:
# Mine again with a lower support threshold (0.005) and record each itemset's size.
frequent_low = fpgrowth(data, min_support=0.005, use_colnames=True)
frequent_low['length'] = frequent_low['itemsets'].apply(len)

# Smallest support observed among itemsets of each length, ordered by length.
lowest_support = frequent_low.groupby('length')['support'].min()
min_support_by_len = lowest_support.reset_index().sort_values('length')

print(min_support_by_len)

fig, ax = plt.subplots()
ax.plot(
    min_support_by_len['length'],
    min_support_by_len['support'],
    marker='o',
)
ax.set_title("Минимальная поддержка по длине наборов")
ax.set_xlabel("Длина набора")
ax.set_ylabel("Минимальная поддержка")
ax.grid()
plt.show()

Эксперименты с параметрами

In [None]:
# Grid of thresholds to sweep.
support_values = [0.01, 0.02, 0.03, 0.05]
conf_values = [0.3, 0.5, 0.7]

results = []

for s in support_values:
    # Frequent itemsets depend only on the support threshold: mine once per value
    # and reuse them for every confidence threshold.
    freq = fpgrowth(data, min_support=s, use_colnames=True)
    itemset_count = len(freq)
    for c in conf_values:
        rule_count = len(association_rules(freq, metric='confidence', min_threshold=c))
        results.append(
            dict(
                min_support=s,
                min_conf=c,
                n_itemsets=itemset_count,
                n_rules=rule_count,
            )
        )

# One row per (min_support, min_conf) combination.
results_df = pd.DataFrame(results)
print(results_df)

# One line per confidence threshold: number of rules as a function of min_support.
fig, ax = plt.subplots(figsize=(8, 5))
for c in conf_values:
    sub = results_df.loc[results_df['min_conf'] == c]
    ax.plot(sub['min_support'], sub['n_rules'], marker='o', label=f'conf={c}')

ax.legend()
ax.set_xlabel("min_support")
ax.set_ylabel("Число правил")
ax.set_title("Зависимость количества правил от параметров")
ax.grid()
plt.show()

Граф

In [None]:
# Fixed seed for the force-directed layout: without it every run places the
# nodes differently and the figure cannot be reproduced.
LAYOUT_SEED = 42

# The 20 FP-Growth rules with the highest confidence.
rules_graph = rules_fpg.sort_values(by='confidence', ascending=False).head(20)

# Directed graph: antecedent label -> consequent label, edge weight = confidence.
G = nx.DiGraph()

# Iterate over the three needed columns directly instead of building a Series
# per row with iterrows().
for a, b, c in zip(
    rules_graph['antecedents_str'],
    rules_graph['consequents_str'],
    rules_graph['confidence'],
):
    G.add_edge(a, b, weight=c)

plt.figure(figsize=(12,8))
pos = nx.spring_layout(G, k=0.4, seed=LAYOUT_SEED)

nx.draw(G, pos, with_labels=True, node_color='lightgreen', node_size=2500, font_size=8, arrows=True)
# Annotate every edge with its confidence, rounded to two decimals.
edge_labels = {
    edge: f"{weight:.2f}"
    for edge, weight in nx.get_edge_attributes(G, 'weight').items()
}
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=8)

plt.title("Граф ассоциативных правил (FPGrowth)")
plt.show()

Своя визуализация

In [None]:
# Bubble chart of the FP-Growth rules: x = support, y = confidence,
# marker size proportional to lift (scaled by 20).
bubble_sizes = rules_fpg['lift'] * 20

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(
    rules_fpg['support'],
    rules_fpg['confidence'],
    s=bubble_sizes,
    alpha=0.6,
)

ax.set_xlabel("Поддержка")
ax.set_ylabel("Достоверность")
ax.set_title("Support vs Confidence (размер точки = Lift)")
ax.grid()
plt.show()