# Association Analysis

## Import all necessary libraries

In [1]:
#!fc-list :lang=zh family

In [2]:
import os
import glob
from functools import reduce
#import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#from scipy import stats
from sklearn.cluster import KMeans
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

#pd.set_option('display.max_rows', 10)
plt.rcParams['font.sans-serif'] = ['Noto Sans Mono CJK TC', 'sans-serif'] 
plt.rcParams['axes.unicode_minus'] = False

%matplotlib inline

## Load Data

In [3]:
try:
    from google.colab import drive

    # Mount the folder "drive" on google drive to Colab Notebook
    drive.mount('/content/drive')
    path = '/content/drive/My Drive/wids-taipei/2020-WiDS-Taipei-MLCC-Workshop/dataset/*.csv'
except ModuleNotFoundError:
    path = '../data/*.csv'

### Read data

In [4]:
# Read data
filenames = glob.glob(path)
pd_dict = {}

for filename in filenames:
    name = filename.split("/")[-1].split(".")[0]
    pd_dict[name] = pd.read_csv(os.path.join(filename))

purchase_data = pd_dict['customer_purchase_dataset']    
payments_data = pd_dict['order_payments_dataset']
reviews_data = pd_dict['order_reviews_dataset']
orders_data = pd_dict['orders_dataset']
customers_data = pd_dict['customers_dataset']

## Association Analysis

In [5]:
def encode_units(x):
    if x <=0 :
        return 0
    else:
        return 1

# crosstab: compute a simple cross tabulation of two (or more) factors.
#           Default -> frequence
basket = pd.crosstab(purchase_data['customer_unique_id'], purchase_data['product_main_category'])
basket_sets = basket.applymap(encode_units)
basket_sets

product_main_category,3C,休閒生活,保健,其他,商業用途,嬰兒用品,安全配件,家居生活,家電,文具,書籍,服飾/配件,美食,藝術
customer_unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0000366f3b9a7992bf8c76cfdf3221e2,0,0,0,0,0,0,0,1,0,0,0,0,0,0
0000b849f77a49e4a4ce2b2a4ca5be3f,0,0,1,0,0,0,0,0,0,0,1,0,0,0
0000f46a3911fa3c0805444483337064,0,0,1,0,0,0,0,0,0,1,1,0,0,0
0000f6ccb0745a6a4b88665a16c9f078,0,0,0,0,0,0,0,1,1,0,0,0,0,0
0004aac84e0df4da2b147fca70cf8255,0,0,0,0,1,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fffcf5a5ff07b0908bd4e2dbc735a684,0,0,1,0,0,0,0,0,0,0,1,0,1,0
fffea47cd6d3cc0a88bd621562a9d061,0,0,0,0,0,1,0,0,0,0,0,0,0,0
ffff371b4d645b6ecea244b27531430a,1,0,1,0,0,0,0,0,0,0,1,0,1,0
ffff5962728ec6157033ef9805bacc48,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
frequent_itemsets = apriori(basket_sets, min_support=0.05, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift")
rules = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

# support(x, y) = number(x, y) / number(all samples)
rules['frequency'] = rules['support'] * len(basket_sets)
rules['length_1'] = rules['antecedents'].apply(lambda x: len(x))
rules['length_2'] = rules['consequents'].apply(lambda x: len(x))

rules_new = rules.loc[(rules['length_1'] == 1) & (rules['length_2'] == 1)]
rules_new["antecedents"] = rules_new["antecedents"].apply(lambda x: list(x)[0]).astype("unicode")
rules_new["consequents"] = rules_new["consequents"].apply(lambda x: list(x)[0]).astype("unicode")
rules_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0,antecedents,consequents,support,confidence,lift,frequency,length_1,length_2
0,休閒生活,3C,0.219646,0.63132,1.758847,20504.0,1,1
1,3C,休閒生活,0.219646,0.611932,1.758847,20504.0,1,1
2,書籍,3C,0.073583,0.327501,0.912412,6869.0,1,1
3,3C,書籍,0.073583,0.205002,0.912412,6869.0,1,1
4,服飾/配件,3C,0.104799,0.518992,1.445904,9783.0,1,1
5,3C,服飾/配件,0.104799,0.291969,1.445904,9783.0,1,1
6,美食,3C,0.055972,0.306956,0.855174,5225.0,1,1
7,3C,美食,0.055972,0.155938,0.855174,5225.0,1,1
8,書籍,休閒生活,0.071644,0.318871,0.916516,6688.0,1,1
9,休閒生活,書籍,0.071644,0.205924,0.916516,6688.0,1,1


In [7]:
cm = sns.light_palette((260, 75, 60), input="husl", as_cmap=True)

support_data = rules_new.groupby(['antecedents', 'consequents']).apply(lambda x: x.sort_values('support', ascending=False))
support_data = support_data[['support']].droplevel(2)
s = support_data.style.background_gradient(cmap=cm)
s

Unnamed: 0_level_0,Unnamed: 1_level_0,support
antecedents,consequents,Unnamed: 2_level_1
3C,休閒生活,0.219646
3C,書籍,0.0735833
3C,服飾/配件,0.104799
3C,美食,0.0559721
休閒生活,3C,0.219646
休閒生活,書籍,0.0716443
休閒生活,服飾/配件,0.168838
休閒生活,美食,0.0534762
保健,書籍,0.112608
保健,美食,0.0613498
