# Import Library

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Persiapan Data

In [None]:
# Load the retail transaction records from the Excel workbook.
data = pd.read_excel('data_transaksi_retail.xlsx')

In [None]:
data

In [None]:
# Build the transaction x item matrix: one row per 'Kode Transaksi', one
# column per 'Nama Barang'. Each cell holds the number of non-null 'Quantity'
# entries for that pair, and 0 where the pair never occurs.
data_basket = (
    data.groupby(['Kode Transaksi', 'Nama Barang'])['Quantity']
    .count()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('Kode Transaksi')
)

In [None]:
data_basket

In [None]:
# Encoding step: turn the per-item counts into 0/1 flags so the frequent-itemset
# algorithm receives a purely binary "item present in transaction" matrix.
def encode_units(x):
    """Return 1 if the item was bought in the transaction, otherwise 0.

    Args:
        x: Numeric count or quantity of one item in one transaction.

    Returns:
        1 for any positive value. 0 for zero, negative values and NaN.
        The result is always an int, never None, so values between 0 and 1
        or missing values cannot leak None into the encoded matrix.
    """
    # NaN compares False against everything, so it falls through to 0.
    return 1 if x > 0 else 0

# Apply the 0/1 encoding to every cell, one column at a time.
# Series.map is used instead of DataFrame.applymap, which is deprecated in
# pandas >= 2.1; the column-wise form works on old and new pandas alike.
data_sets = data_basket.apply(lambda column: column.map(encode_units))
data_sets

# Memahami Metrics

In [None]:
# Mine frequent itemsets from the encoded basket matrix with a minimum support
# of 0.01; use_colnames=True reports item names instead of column indices.
frequent_itemsets = apriori(data_sets, min_support=0.01, use_colnames=True)
frequent_itemsets

In [None]:
# Derive association rules from the frequent itemsets. A confidence threshold
# of 0 filters nothing out, so every candidate rule is kept for inspection.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0)
rules

In [None]:
# k = number of items contained in each frequent itemset.
frequent_itemsets['k'] = frequent_itemsets['itemsets'].map(len)

## Metric Support

In [None]:
# Show the frequent itemsets ordered by support (ascending by default).
frequent_itemsets.sort_values('support')

In [None]:
# Show the rules ordered by support (ascending by default).
rules.sort_values('support')

In [None]:
# Support vs. itemset size k. x and y are passed by keyword because
# seaborn >= 0.12 accepts only `data` positionally.
sns.scatterplot(x='support', y='k', data=frequent_itemsets)

Semakin tinggi nilai support, semakin kecil nilai k.

In [None]:
# Distribution of support across frequent itemsets. sns.distplot is deprecated
# and removed in recent seaborn; histplot with a KDE overlay on a density scale
# is its documented replacement.
sns.histplot(frequent_itemsets['support'], kde=True, stat='density')

In [None]:
# Distribution of support across association rules. sns.distplot is deprecated
# and removed in recent seaborn; histplot with a KDE overlay on a density scale
# is its documented replacement.
sns.histplot(rules['support'], kde=True, stat='density')

Semakin tinggi nilai support, variasi rules yang dihasilkan akan semakin sedikit.

## Metric Confidence

In [None]:
# Show the rules ordered by confidence (ascending by default).
rules.sort_values('confidence')

In [None]:
# Confidence vs. support. x and y are passed by keyword because
# seaborn >= 0.12 accepts only `data` positionally.
sns.scatterplot(x='confidence', y='support', data=rules)

## Metric Lift

In [None]:
# Show the rules ordered by lift (ascending by default).
rules.sort_values('lift')

In [None]:
# Lift vs. support. x and y are passed by keyword because
# seaborn >= 0.12 accepts only `data` positionally.
sns.scatterplot(x='lift', y='support', data=rules)

In [None]:
# Confidence vs. lift. x and y are passed by keyword because
# seaborn >= 0.12 accepts only `data` positionally.
sns.scatterplot(x='confidence', y='lift', data=rules)

## Metric Leverage

In [None]:
# Show the rules ordered by leverage (ascending by default).
rules.sort_values('leverage')

In [None]:
# Lift vs. leverage. x and y are passed by keyword because
# seaborn >= 0.12 accepts only `data` positionally.
sns.scatterplot(x='lift', y='leverage', data=rules)

# Pemilihan Association Rules

## Melihat karakteristik dari pemilihan nilai minimum support

Kita akan melihat seperti apa kumpulan frequent itemset yang dihasilkan untuk berbagai nilai minimum support

In [None]:
# Frequent itemsets at min_support = 0.01, the lowest threshold compared in
# this section.
frequent_itemsets = apriori(data_sets, min_support=0.01, use_colnames=True)
frequent_itemsets

In [None]:
# Frequent itemsets at min_support = 0.05 (replaces the previous result).
frequent_itemsets = apriori(data_sets, min_support=0.05, use_colnames=True)
frequent_itemsets

In [None]:
# Frequent itemsets at min_support = 0.1 (replaces the previous result).
frequent_itemsets = apriori(data_sets, min_support=0.1, use_colnames=True)
frequent_itemsets

In [None]:
# If min_support is set too low, this cell takes a long time to run,
# because a very large number of frequent itemsets is generated.
frequent_itemsets = apriori(data_sets, min_support=0.001, use_colnames=True)
frequent_itemsets

In [None]:
# Explore the pattern: how many frequent itemsets of each size k are found
# at each minimum-support threshold.
minsup_list = [0.01, 0.03, 0.05, 0.07, 0.1, 0.2]
support_rows = []
for minsup in minsup_list:
    frequent_itemsets = apriori(data_sets, min_support=minsup, use_colnames=True)
    frequent_itemsets['k'] = frequent_itemsets['itemsets'].apply(len)

    # One summary record per threshold: the total number of itemsets plus one
    # "<k>-itemset" count for every itemset size that occurs.
    dat = {'min_support': minsup, 'row': frequent_itemsets.shape[0]}
    size_counts = frequent_itemsets['k'].value_counts().sort_index()
    for k, n_itemsets in size_counts.items():
        dat[f"{k}-itemset"] = n_itemsets
    support_rows.append(dat)

# DataFrame.append was removed in pandas 2.0, so the records are collected in
# a list and the frame is built once. Sizes absent at a threshold become 0.
df_support = pd.DataFrame(support_rows).fillna(0)
# Cast only the count columns to int; min_support must keep its float value.
count_columns = df_support.columns.drop('min_support')
df_support = df_support.astype({column: int for column in count_columns})
df_support

Semakin kecil nilai minimum support, akan semakin banyak variasi frequent itemset yang dapat ditemukan.

Berarti, akan semakin banyak pula association rules yang bisa didapatkan

## Menentukan kriteria nilai support dan confidence

In [None]:
# Use min_support = 0.01 (a small value for which the output is still produced).
frequent_itemsets = apriori(data_sets, min_support=0.01, use_colnames=True)

In [None]:
# Use a minimum confidence of 0.5 (small, but still an acceptable confidence).
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

In [None]:
# Keep only the rules whose lift is greater than 1.
rules = rules.loc[rules['lift'].gt(1)]

In [None]:
rules

In [None]:
# Check the support vs. confidence scatter plot. x and y are passed by keyword
# because seaborn >= 0.12 accepts only `data` positionally.
sns.scatterplot(x='support', y='confidence', data=rules)

In [None]:
# Select the rules with support >= 0.1 and confidence >= 0.6.
selected_rules = rules.loc[
    rules['support'].ge(0.1) & rules['confidence'].ge(0.6)
]

In [None]:
# Show the selected rules, highest confidence first.
selected_rules.sort_values('confidence', ascending=False)

In [None]:
# Show the selected rules, highest support first.
selected_rules.sort_values('support', ascending=False)

Kita temukan bahwa Shampo Biasa dan Serum Vitamin sering menjadi consequents dengan confidence yang tinggi. Insight ini bisa kita manfaatkan untuk berbagai strategi promosi.

## Analisis terhadap suatu produk tertentu

In [None]:
# Recompute the itemsets (min_support = 0.01) and rules (confidence >= 0.5)
# used for the per-product analysis; no lift filter is applied here.
frequent_itemsets = apriori(data_sets, min_support=0.01, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

Misalnya, kita ingin mencari frequent itemsets dan association rules yang berkaitan dengan produk Hair Dryer.

In [None]:
# Frequent itemsets that contain 'Hair Dryer', shown with highest support first.
frequent_itemsets_hairdryer = frequent_itemsets.loc[
    frequent_itemsets['itemsets'].map(lambda itemset: 'Hair Dryer' in itemset)
]
frequent_itemsets_hairdryer.sort_values(by='support', ascending=False)

In [None]:
# Rules with 'Hair Dryer' among the antecedents, highest support first.
rules_hairdryer_1 = rules.loc[
    rules['antecedents'].map(lambda itemset: 'Hair Dryer' in itemset)
]
rules_hairdryer_1.sort_values(by='support', ascending=False)

In [None]:
# Rules with 'Hair Dryer' among the consequents, highest support first.
# Stored under its own name so it does not overwrite the antecedent-side
# result held in rules_hairdryer_1.
rules_hairdryer_2 = rules[rules['consequents'].apply(lambda x: 'Hair Dryer' in x)]
rules_hairdryer_2.sort_values('support', ascending=False)

# Visualisasi Output

In [None]:
# Rebuild the final rule set for the visualisation section:
# support >= 0.1, confidence >= 0.6, and lift > 1.
frequent_itemsets = apriori(data_sets, min_support=0.1, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)
rules = rules[rules['lift'] > 1]

In [None]:
# Import WordCloud and set the matplotlib style and the default figure size
# (15 x 15) for the plots that follow.
from wordcloud import WordCloud
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (15, 15)

In [None]:
# Inspect the string form of the itemsets column (the Series' printed repr).
str(frequent_itemsets['itemsets'])

In [None]:
# Build the cloud from how often each item name occurs in the transaction data.
# str(data['Nama Barang']) is only the Series' display repr (truncated for long
# Series, plus a name/dtype footer), so it would not reflect real popularity.
# Passing explicit frequencies also keeps multi-word item names intact.
item_frequencies = data['Nama Barang'].dropna().astype(str).value_counts().to_dict()
wordcloud = WordCloud(
    background_color='white', width=1200, height=1200, max_words=121
).generate_from_frequencies(item_frequencies)
plt.imshow(wordcloud)
plt.axis('off')
plt.title('Most Popular Items', fontsize=20)
plt.show()

In [None]:
rules

In [None]:
# List the column names of the rules table.
rules.columns.tolist()