In [16]:
import pandas as pd

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# 1. 샘플 데이터셋을 생성

In [17]:
dataset = [['Onion', 'Potato', 'Burger'],
           ['Potato', 'Burger', 'Milk'],
           ['Milk', 'Beer'],
           ['Potato', 'Milk'],
           ['Onion', 'Potato', 'Burger', 'Beer'],
           ['Onion', 'Potato', 'Burger', 'Milk']
          ]

In [19]:
dataset

[['Onion', 'Potato', 'Burger'],
 ['Potato', 'Burger', 'Milk'],
 ['Milk', 'Beer'],
 ['Potato', 'Milk'],
 ['Onion', 'Potato', 'Burger', 'Beer'],
 ['Onion', 'Potato', 'Burger', 'Milk']]

# 2.  아래 코드는 아래의 표 데이터 순서대로 포함되어 있으면 True, 없으면 False를 저장한 값으로 인코딩함
|Beer|Burger|Milk|Onion|Potato|
|:--:|:--:|:--:|:--:|:--:|
|3행, 5행|1행, 2행, 5행, 6행|2행, 3행, 4행, 6행|1행, 5행, 6행|1행, 2행, 4행, 5행, 6행|

In [21]:
encode = TransactionEncoder()

encoded_array = encode.fit(dataset).transform(dataset)
encoded_array

array([[False,  True, False,  True,  True],
       [False,  True,  True, False,  True],
       [ True, False,  True, False, False],
       [False, False,  True, False,  True],
       [ True,  True, False,  True,  True],
       [False,  True,  True,  True,  True]])

# 3. 인코딩된 결과를 데이터프레임으로 변환, 이때 열 이름은 각각 인코딩할 때 사용된 컬럼명

In [22]:
dataframe = pd.DataFrame(encoded_array, columns=encode.columns_)
dataframe

Unnamed: 0,Beer,Burger,Milk,Onion,Potato
0,False,True,False,True,True
1,False,True,True,False,True
2,True,False,True,False,False
3,False,False,True,False,True
4,True,True,False,True,True
5,False,True,True,True,True


# 4. 최소 지지도 값이 0.4가 넘는 값만을 사용해서 빈번항목집합을 생성
- support = P(A∩B)
- A와 B가 동시에 일어난 횟수 / 전체 거래 횟수

In [23]:
frequent_itemsets = apriori(dataframe, min_support=0.4, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.666667,(Burger)
1,0.666667,(Milk)
2,0.5,(Onion)
3,0.833333,(Potato)
4,0.5,"(Onion, Burger)"
5,0.666667,"(Potato, Burger)"
6,0.5,"(Potato, Milk)"
7,0.5,"(Potato, Onion)"
8,0.5,"(Potato, Onion, Burger)"


# 5. confidence 값이 최소 0.7 이상인 값만 pattern_rules에 담아서 이를 출력
- confidence = P(A∩B) / P(A)
- A와 B가 동시에 일어난 횟수 / A가 일어난 횟수

In [24]:
pattern_rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
pattern_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Onion),(Burger),0.5,0.666667,0.5,1.0,1.5,0.166667,inf,0.666667
1,(Burger),(Onion),0.666667,0.5,0.5,0.75,1.5,0.166667,2.0,1.0
2,(Potato),(Burger),0.833333,0.666667,0.666667,0.8,1.2,0.111111,1.666667,1.0
3,(Burger),(Potato),0.666667,0.833333,0.666667,1.0,1.2,0.111111,inf,0.5
4,(Milk),(Potato),0.666667,0.833333,0.5,0.75,0.9,-0.055556,0.666667,-0.25
5,(Onion),(Potato),0.5,0.833333,0.5,1.0,1.2,0.083333,inf,0.333333
6,"(Potato, Onion)",(Burger),0.5,0.666667,0.5,1.0,1.5,0.166667,inf,0.666667
7,"(Potato, Burger)",(Onion),0.666667,0.5,0.5,0.75,1.5,0.166667,2.0,1.0
8,"(Onion, Burger)",(Potato),0.5,0.833333,0.5,1.0,1.2,0.083333,inf,0.333333
9,(Onion),"(Potato, Burger)",0.5,0.666667,0.5,1.0,1.5,0.166667,inf,0.666667


# Text Mining Practice

In [37]:
import pandas as pd 
import numpy as np
from pandas import DataFrame
from tqdm import tqdm
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [38]:
basket = pd.read_csv('./data/Market_Basket_Optimisation.csv')
basket.head()

Unnamed: 0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
0,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
1,chutney,,,,,,,,,,,,,,,,,,,
2,turkey,avocado,,,,,,,,,,,,,,,,,,
3,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,
4,low fat yogurt,,,,,,,,,,,,,,,,,,,


## 1. Converting the data frame into a list of lists

In [39]:
records = [[str(basket.values[i, j]) for j in range(20)] for i in tqdm(range(7501), desc="Processing records")]

Processing records: 100%|██████████████████▉| 7499/7501 [09:47<00:00, 14.44it/s]

IndexError: index 7500 is out of bounds for axis 0 with size 7500

### explanation

## 2. Encoding and transforming back to data frame

In [40]:
encode = TransactionEncoder()
encoded_array = encode.fit(records).transform(records)

data_frame = pd.DataFrame(encoded_array, columns = encode.columns_)
data_frame

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,toothpaste,turkey,vegetables mix,water spray,white wine,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
743,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
744,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
745,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### explanation

## 3. Drop missing values

In [41]:
basket_clean = data_frame.drop(['nan'], axis = 1)
basket_clean

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,toothpaste,turkey,vegetables mix,water spray,white wine,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
743,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
744,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
745,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### explanation

## 4. Train with Apriori

In [42]:
frequent_itemsets = apriori(basket_clean, min_support = 0.04, use_colnames = True)
frequent_itemsets.head()

Unnamed: 0,support,itemsets
0,0.080321,(burgers)
1,0.072289,(cake)
2,0.042838,(champagne)
3,0.06158,(chicken)
4,0.187416,(chocolate)


### explanation

## 5. Creating rules

In [44]:
pattern_rules = association_rules(frequent_itemsets, metric = 'confidence', min_threshold = 0.3)
pattern_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(milk),(chocolate),0.124498,0.187416,0.045515,0.365591,1.950691,0.022182,1.280852,0.556665
1,(chocolate),(mineral water),0.187416,0.261044,0.066934,0.357143,1.368132,0.01801,1.149487,0.331137
2,(french fries),(eggs),0.157965,0.230254,0.05087,0.322034,1.398601,0.014498,1.135375,0.338465
3,(eggs),(mineral water),0.230254,0.261044,0.069612,0.302326,1.15814,0.009505,1.05917,0.177391
4,(ground beef),(mineral water),0.10174,0.261044,0.045515,0.447368,1.713765,0.018957,1.337158,0.463663
5,(milk),(mineral water),0.124498,0.261044,0.049531,0.397849,1.524069,0.017032,1.227194,0.39276
6,(spaghetti),(mineral water),0.180723,0.261044,0.064257,0.355556,1.362051,0.01708,1.146656,0.324449


### explanation