# 확률과 통계

연관 규칙 분석 실습을 위해 데이터를 트랜잭션(transaction) 형태로 표현하기

In [3]:
import mlxtend
from mlxtend.frequent_patterns import apriori, association_rules

In [4]:
# Build the transactions as a plain list of lists: one inner list per basket,
# each holding the names of the purchased items.
# NOTE(review): 'Cookie' appears twice in the last basket, and 'Kidney Beans'
# is a different item from 'Beans' -- confirm both are intentional.
dataset = [
    ['Milk', 'Cookie', 'Apple', 'Beans', 'Eggs', 'Yogurt'],
    ['Coke', 'Cookie', 'Apple', 'Beans', 'Eggs', 'Yogurt'],
    ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
    ['Milk', 'Orange', 'Corn', 'Beans', 'Yogurt'],
    ['Corn', 'Cookie', 'Cookie', 'Beans', 'Ice cream', 'Eggs'],
]
# Show the container type of the transactions.
type(dataset)

list

판다스의 데이터프레임 형태로 트랜잭션 생성

In [7]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

# Create the encoder object used to build the transaction matrix.
te = TransactionEncoder()
# Learn the item vocabulary and encode every transaction in one call;
# the result is a boolean array with one row per transaction.
te_ary = te.fit_transform(dataset)
te_ary

array([[ True,  True, False,  True, False,  True, False, False,  True,
        False,  True],
       [ True,  True,  True,  True, False,  True, False, False, False,
        False,  True],
       [ True, False, False, False, False,  True, False,  True,  True,
        False, False],
       [False,  True, False, False,  True, False, False, False,  True,
         True,  True],
       [False,  True, False,  True,  True,  True,  True, False, False,
        False, False]])

In [8]:
# Wrap the encoded array in a DataFrame, labelling each column with the
# item name the encoder assigned to it.
df = pd.DataFrame(data=te_ary, columns=te.columns_)
df

Unnamed: 0,Apple,Beans,Coke,Cookie,Corn,Eggs,Ice cream,Kidney Beans,Milk,Orange,Yogurt
0,True,True,False,True,False,True,False,False,True,False,True
1,True,True,True,True,False,True,False,False,False,False,True
2,True,False,False,False,False,True,False,True,True,False,False
3,False,True,False,False,True,False,False,False,True,True,True
4,False,True,False,True,True,True,True,False,False,False,False


mlxtend의 Apriori 알고리즘 적용하기

In [9]:
import mlxtend
from mlxtend.frequent_patterns import apriori, association_rules

# Find itemsets whose support is at least 0.6. Without use_colnames the
# itemsets are reported as column indices of df rather than item names.
apriori(df, min_support=0.6)

Unnamed: 0,support,itemsets
0,0.6,(0)
1,0.8,(1)
2,0.6,(3)
3,0.8,(5)
4,0.6,(8)
5,0.6,(10)
6,0.6,"(0, 5)"
7,0.6,"(1, 3)"
8,0.6,"(1, 5)"
9,0.6,"(1, 10)"


In [10]:
# Same search, but show product names in the itemsets column instead of
# column indices.
apriori(df, use_colnames=True, min_support=0.6)

Unnamed: 0,support,itemsets
0,0.6,(Apple)
1,0.8,(Beans)
2,0.6,(Cookie)
3,0.8,(Eggs)
4,0.6,(Milk)
5,0.6,(Yogurt)
6,0.6,"(Apple, Eggs)"
7,0.6,"(Cookie, Beans)"
8,0.6,"(Beans, Eggs)"
9,0.6,"(Beans, Yogurt)"


연관 규칙 분석 결과를 필터링하기

In [12]:
# Store the Apriori result as frequent_itemsets and add a 'length' column
# holding the number of items in each itemset, so that rows can later be
# filtered by itemset size.
frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)
# len is passed directly; wrapping it in a lambda adds nothing.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.6,(Apple),1
1,0.8,(Beans),1
2,0.6,(Cookie),1
3,0.8,(Eggs),1
4,0.6,(Milk),1
5,0.6,(Yogurt),1
6,0.6,"(Apple, Eggs)",2
7,0.6,"(Cookie, Beans)",2
8,0.6,"(Beans, Eggs)",2
9,0.6,"(Beans, Yogurt)",2


In [13]:
# Select the rows of frequent_itemsets that hold exactly two items and
# have a support of at least 0.6.
frequent_itemsets[
    frequent_itemsets['length'].eq(2)
    & frequent_itemsets['support'].ge(0.6)
]

Unnamed: 0,support,itemsets,length
6,0.6,"(Apple, Eggs)",2
7,0.6,"(Cookie, Beans)",2
8,0.6,"(Beans, Eggs)",2
9,0.6,"(Beans, Yogurt)",2
10,0.6,"(Cookie, Eggs)",2


연관 규칙 분석 결과에서 confidence, lift 등 조건에 맞는 패턴 찾기

In [14]:
# association_rules takes the frequent-itemset DataFrame and keeps the rules
# that satisfy a threshold on a chosen metric such as confidence or lift.
# Here rules are derived from the Apriori result with a minimum confidence
# of 0.7.
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.7,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Apple),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
1,(Eggs),(Apple),0.8,0.6,0.6,0.75,1.25,0.12,1.6
2,(Cookie),(Beans),0.6,0.8,0.6,1.0,1.25,0.12,inf
3,(Beans),(Cookie),0.8,0.6,0.6,0.75,1.25,0.12,1.6
4,(Beans),(Eggs),0.8,0.8,0.6,0.75,0.9375,-0.04,0.8
5,(Eggs),(Beans),0.8,0.8,0.6,0.75,0.9375,-0.04,0.8
6,(Beans),(Yogurt),0.8,0.6,0.6,0.75,1.25,0.12,1.6
7,(Yogurt),(Beans),0.6,0.8,0.6,1.0,1.25,0.12,inf
8,(Cookie),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
9,(Eggs),(Cookie),0.8,0.6,0.6,0.75,1.25,0.12,1.6


In [15]:
# Derive the rules whose lift is at least 1.2.
rules2 = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1.2,
)
rules2

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Apple),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
1,(Eggs),(Apple),0.8,0.6,0.6,0.75,1.25,0.12,1.6
2,(Cookie),(Beans),0.6,0.8,0.6,1.0,1.25,0.12,inf
3,(Beans),(Cookie),0.8,0.6,0.6,0.75,1.25,0.12,1.6
4,(Beans),(Yogurt),0.8,0.6,0.6,0.75,1.25,0.12,1.6
5,(Yogurt),(Beans),0.6,0.8,0.6,1.0,1.25,0.12,inf
6,(Cookie),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
7,(Eggs),(Cookie),0.8,0.6,0.6,0.75,1.25,0.12,1.6
8,"(Cookie, Beans)",(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
9,"(Cookie, Eggs)",(Beans),0.6,0.8,0.6,1.0,1.25,0.12,inf


In [18]:
# Filter the confidence-based rules.
# Add an 'antecedent_len' column holding the number of items in each rule's
# antecedent; len is passed directly instead of through a redundant lambda.
rules['antecedent_len'] = rules['antecedents'].apply(len)
# Keep rules with at least two antecedent items, confidence strictly above
# 0.75 and lift strictly above 1.2.
rules[
    (rules['antecedent_len'] >= 2)
    & (rules['confidence'] > 0.75)
    & (rules['lift'] > 1.2)
]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
10,"(Cookie, Beans)",(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf,2
11,"(Cookie, Eggs)",(Beans),0.6,0.8,0.6,1.0,1.25,0.12,inf,2
12,"(Beans, Eggs)",(Cookie),0.6,0.6,0.6,1.0,1.666667,0.24,inf,2
