In [85]:
import time
import pandas as pd
import plotly.express as px
from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder

In [86]:
def read_file(filename): # Функция чтения транзакций из файла
  with open(filename, 'r') as temp_f:
    col_count = [ len(l.split(" ")) for l in temp_f.readlines() ] # Подсчет максимального количества объектов в транзакции
  column_names = [i for i in range(0, max(col_count))]
  df = pd.read_csv(filename, sep = ' ', header=None, names=column_names)
  df_out = df.apply(lambda x: list(x.dropna().values), axis=1).tolist() # Создание массива транзакций
  transactionEncoder = TransactionEncoder() # Трансформация массива к нужному для алгоритма формату
  dataset = transactionEncoder.fit(df_out).transform(df_out)
  dataset = dataset.astype('int')
  df = pd.DataFrame(dataset, columns=transactionEncoder.columns_)
  return df # Возврат готового DataFrame для алгоритма Apriori

In [87]:
def find_result(df): # Функция экспериментирования над наборами данных
  min_supports = [] # Порог поддержки для каждого эксперимента
  items_count = [] # Количество наборов
  process_time = [] # Время работы
  for i in range(0,5):
    start = time.time()
    if len(df) <= 1000:
      min_support = i*0.05+0.05
      frequent_itemsets = apriori(df, min_support = min_support) # Запуск алгоритма Apriori
      min_supports.append(min_support*100)
    else:
      min_support = i*0.1+0.5
      frequent_itemsets = apriori(df, min_support = min_support)
      min_supports.append(min_support*100)
    end = time.time()
    items_count.append(frequent_itemsets['itemsets'].count())
    process_time.append(end-start)
  result = pd.DataFrame(dict(min_supports = min_supports, items_count = items_count, process_time = process_time)) 
  return result # Возврат результатов эксперимента над набором данных


In [88]:
def visualize_results(result): # Функция визуализации результатов с помощью библиотеки Plotly
  fig = px.bar(result, x = 'min_supports', y = 'process_time', labels = {'min_supports': 'Порог поддержки, %', 'process_time': 'Время выполнения, с'})
  fig.show()
  fig = px.bar(result, x = 'min_supports', y = 'items_count', labels = {'min_supports': 'Порог поддержки, %', 'items_count': 'Количество наборов'})
  fig.show()


In [89]:
df_retail = read_file('retail.dat') # Чтение файла
df_retail

Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,3172.0,3173.0,3174.0,3175.0,3176.0,3177.0,3178.0,3179.0,3180.0,3181.0
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,0,0,0


In [90]:
df_accidents = read_file('accidents.dat') # Чтение файла
df_accidents

Unnamed: 0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,...,252.0,253.0,254.0,255.0,256.0,257.0,258.0,259.0,260.0,261.0
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1494,1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1495,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1496,1,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1497,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [91]:
frequent_itemsets = apriori(df_retail, min_support=0.05) # Поиск частых наборов с помощью алгоритма Apriori
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.121,(32)
1,0.244,(38)
2,0.608,(39)
3,0.239,(41)
4,0.437,(48)
5,0.054,(170)
6,0.056,(1327)
7,0.068,"(32, 39)"
8,0.052,"(32, 48)"
9,0.154,"(38, 39)"


In [92]:
sorted_frequent_itemsets = frequent_itemsets.sort_values(by = 'support', ascending=False) # Сортировка результатов по порогу поддержки
sorted_frequent_itemsets

Unnamed: 0,support,itemsets
2,0.608,(39)
4,0.437,(48)
14,0.32,"(48, 39)"
1,0.244,(38)
3,0.239,(41)
13,0.187,"(41, 39)"
9,0.154,"(38, 39)"
15,0.129,"(48, 41)"
0,0.121,(32)
11,0.111,"(48, 38)"


In [93]:
result_retail = find_result(df_retail)
result_retail

Unnamed: 0,min_supports,items_count,process_time
0,5.0,19,0.117719
1,10.0,11,0.108012
2,15.0,7,0.107316
3,20.0,5,0.113739
4,25.0,3,0.126017


In [94]:
result_accidents = find_result(df_accidents)
result_accidents

Unnamed: 0,min_supports,items_count,process_time
0,50.0,9921,1.316516
1,60.0,2957,0.349925
2,70.0,847,0.089897
3,80.0,199,0.029754
4,90.0,31,0.012501


In [95]:
visualize_results(result_retail)

In [96]:
visualize_results(result_accidents)