### Associating rule learning

Task:

- Add all pairs and parameters to a new df
- Define Support, Confidence and Lift of all pairs of products
- Find TOP-3 products
- Find TOP-3 pairs of products
- Save file to excel


In [54]:
import numpy as np
import pandas as pd
from apyori import apriori
from sklearn import set_config
set_config(print_changed_only=False)

In [55]:
df_Groceries = pd.read_csv('Groceries.csv', header=None)

df_Groceries.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,citrus fruit,semi-finished bread,margarine,ready soups,,,,,,,...,,,,,,,,,,
1,tropical fruit,yogurt,coffee,,,,,,,,...,,,,,,,,,,
2,whole milk,,,,,,,,,,...,,,,,,,,,,
3,pip fruit,yogurt,cream cheese,meat spreads,,,,,,,...,,,,,,,,,,
4,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,,...,,,,,,,,,,


In [56]:
df_Groceries.shape

(9835, 32)

In [57]:
data = []
for i in range(df_Groceries.shape[0]):
    data.append([str(df_Groceries.values[i,j]) for j in range(df_Groceries.shape[1]) if pd.notna(df_Groceries.values[i, j])])
    
    

In [74]:
#max_length = 2, определяет что максим нам показывают пары, а min_length = 2 , показывает что мы ищем топ от 2 (не меньшечем пар)

association_rules = apriori(data, min_support=0.0045, min_confidence=0.2, min_lift=3, max_length=2)
association_results = list(association_rules)
#association_results 

In [59]:
# create dict for a new df from the list of rules

dict_ = {}

i = 0

for item in association_results:
    #pair = item[0]
    #items = [x for x in pair]
    dict_[i] = [ item[0], item[1], item[2][0][0], item[2][0][3] ]
    i = i +1
    
dict_    
    

{0: [frozenset({'baking powder', 'whipped/sour cream'}),
  0.004575495678698526,
  frozenset({'baking powder'}),
  3.607850330154072],
 1: [frozenset({'beef', 'root vegetables'}),
  0.017386883579054397,
  frozenset({'beef'}),
  3.0403668431100312],
 2: [frozenset({'berries', 'whipped/sour cream'}),
  0.009049313675648195,
  frozenset({'berries'}),
  3.796885505454703],
 3: [frozenset({'bottled beer', 'liquor'}),
  0.004677173360447382,
  frozenset({'liquor'}),
  5.240594013529793],
 4: [frozenset({'bottled beer', 'red/blush wine'}),
  0.004880528723945094,
  frozenset({'red/blush wine'}),
  3.1537598204264876],
 5: [frozenset({'flour', 'sugar'}),
  0.00498220640569395,
  frozenset({'flour'}),
  8.46311223504206],
 6: [frozenset({'herbs', 'root vegetables'}),
  0.007015760040671073,
  frozenset({'herbs'}),
  3.956477378731343],
 7: [frozenset({'sausage', 'sliced cheese'}),
  0.007015760040671073,
  frozenset({'sliced cheese'}),
  3.047434930215013]}

In [60]:
# create DataFrame 

df = pd.DataFrame(dict_, index =['Pair', 'Support', 'Confidence', 'Lift'])
df


Unnamed: 0,0,1,2,3,4,5,6,7
Pair,"(whipped/sour cream, baking powder)","(beef, root vegetables)","(berries, whipped/sour cream)","(liquor, bottled beer)","(bottled beer, red/blush wine)","(sugar, flour)","(herbs, root vegetables)","(sliced cheese, sausage)"
Support,0.0045755,0.0173869,0.00904931,0.00467717,0.00488053,0.00498221,0.00701576,0.00701576
Confidence,(baking powder),(beef),(berries),(liquor),(red/blush wine),(flour),(herbs),(sliced cheese)
Lift,3.60785,3.04037,3.79689,5.24059,3.15376,8.46311,3.95648,3.04743


In [61]:
# create Transpose

df = pd.DataFrame(dict_, index =['Pair', 'Support', 'Confidence', 'Lift']).T
df



Unnamed: 0,Pair,Support,Confidence,Lift
0,"(whipped/sour cream, baking powder)",0.0045755,(baking powder),3.60785
1,"(beef, root vegetables)",0.0173869,(beef),3.04037
2,"(berries, whipped/sour cream)",0.00904931,(berries),3.79689
3,"(liquor, bottled beer)",0.00467717,(liquor),5.24059
4,"(bottled beer, red/blush wine)",0.00488053,(red/blush wine),3.15376
5,"(sugar, flour)",0.00498221,(flour),8.46311
6,"(herbs, root vegetables)",0.00701576,(herbs),3.95648
7,"(sliced cheese, sausage)",0.00701576,(sliced cheese),3.04743


In [62]:
# TOP-3 pairs of products 

df.sort_values('Support', ascending=False, inplace=True)
df.head(3)


Unnamed: 0,Pair,Support,Confidence,Lift
1,"(beef, root vegetables)",0.0173869,(beef),3.04037
2,"(berries, whipped/sour cream)",0.00904931,(berries),3.79689
6,"(herbs, root vegetables)",0.00701576,(herbs),3.95648


In [72]:
# make array from datafraime

new = df_Groceries.to_numpy().reshape(-1)
new

array(['citrus fruit', 'semi-finished bread', 'margarine', ..., nan, nan,
       nan], dtype=object)

In [71]:
# The function to return a new Series with missing values removed.

ser = pd.Series(new)
ser.dropna(inplace=True)
ser

In [73]:
# Find TOP-3 products

ser.value_counts()[:3]

whole milk          2513
other vegetables    1903
rolls/buns          1809
dtype: int64

In [75]:
# save file to excel

ser.to_excel('output1.xlsx', engine='xlsxwriter')  