In [1]:
import heapq
from collections import defaultdict

import pandas as pd
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split

# Rule Selection 실습

In [2]:
# Load and preprocess data set
fp_df = pd.read_csv('Faceplate.csv')
fp_df.set_index('Transaction',inplace = True)
fp_df

Unnamed: 0_level_0,Red,White,Blue,Orange,Green,Yellow
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,1,0,0,1,0
2,0,1,0,1,0,0
3,0,1,1,0,0,0
4,1,1,0,1,0,0
5,1,0,1,0,0,0
6,0,1,1,0,0,0
7,1,0,1,0,0,0
8,1,1,1,0,1,0
9,1,1,1,0,0,0
10,0,0,0,0,0,1


* **apriori 주요 파라미터**<br>

**df**: 정해진 format의 dataframe만을 받을 수 있다.<br>
**min_support** : 0~1사이의 minimum support 비율을 선정<br>
**use_colnames** : True로 설정하면 itemsets에 컬럼인덱스 대신에 컬럼이름을 사용한다.(Default = False)<br>
**max_len** : itemset의 최대길이를 설정<br>|
<br>
Return : pandas DataFrame with columns ['support', 'itemsets'] of all itemsets

In [3]:
# create frequent itemsets
itemsets = apriori(fp_df,min_support = 0.2, use_colnames = True)
itemsets



Unnamed: 0,support,itemsets
0,0.6,(Red)
1,0.7,(White)
2,0.6,(Blue)
3,0.2,(Orange)
4,0.2,(Green)
5,0.4,"(White, Red)"
6,0.4,"(Red, Blue)"
7,0.2,"(Green, Red)"
8,0.4,"(White, Blue)"
9,0.2,"(Orange, White)"


* **association_rules 주요 파라미터**<br>

**df**: ['support', 'itemsets'] 컬럼을 가진 데이터프레임<br>
**metric** : 규칙들을 Filtering하기 위한 기준을 선정(Default = 'confidence')<br>
**min_threshold** : 기준으로 사용할 Minimal Threshold를 지정(Default = 0.8)<br>
<br>
Return : pandas DataFrame with columns "antecedents" and "consequents"
that store itemsets, plus the scoring metric columns:
<br>
  "antecedent support", "consequent support",
  "support", "confidence", "lift",
  "leverage", "conviction"<br>
  of all rules for which
  metric(rule) >= min_threshold.

In [4]:
# convert into rules
rules = association_rules(itemsets, metric = 'confidence', min_threshold = 0.5)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(White),(Red),0.7,0.6,0.4,0.571429,0.952381,-0.02,0.933333,-0.142857
1,(Red),(White),0.6,0.7,0.4,0.666667,0.952381,-0.02,0.9,-0.111111
2,(Red),(Blue),0.6,0.6,0.4,0.666667,1.111111,0.04,1.2,0.25
3,(Blue),(Red),0.6,0.6,0.4,0.666667,1.111111,0.04,1.2,0.25
4,(Green),(Red),0.2,0.6,0.2,1.0,1.666667,0.08,inf,0.5
5,(White),(Blue),0.7,0.6,0.4,0.571429,0.952381,-0.02,0.933333,-0.142857
6,(Blue),(White),0.6,0.7,0.4,0.666667,0.952381,-0.02,0.9,-0.111111
7,(Orange),(White),0.2,0.7,0.2,1.0,1.428571,0.06,inf,0.375
8,(Green),(White),0.2,0.7,0.2,1.0,1.428571,0.06,inf,0.375
9,"(White, Red)",(Blue),0.4,0.6,0.2,0.5,0.833333,-0.04,0.8,-0.25


In [5]:
rules.sort_values(by=['lift'],ascending = False).head(6)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
14,"(White, Red)",(Green),0.4,0.2,0.2,0.5,2.5,0.12,1.6,1.0
15,(Green),"(White, Red)",0.2,0.4,0.2,1.0,2.5,0.12,inf,0.75
4,(Green),(Red),0.2,0.6,0.2,1.0,1.666667,0.08,inf,0.5
12,"(Green, White)",(Red),0.2,0.6,0.2,1.0,1.666667,0.08,inf,0.5
7,(Orange),(White),0.2,0.7,0.2,1.0,1.428571,0.06,inf,0.375
8,(Green),(White),0.2,0.7,0.2,1.0,1.428571,0.06,inf,0.375


In [6]:
print(fp_df)
print(rules.sort_values(by=['lift'],ascending = False)
      .drop(columns = ['antecedent support','consequent support','conviction','zhangs_metric']).head(6))

             Red  White  Blue  Orange  Green  Yellow
Transaction                                         
1              1      1     0       0      1       0
2              0      1     0       1      0       0
3              0      1     1       0      0       0
4              1      1     0       1      0       0
5              1      0     1       0      0       0
6              0      1     1       0      0       0
7              1      0     1       0      0       0
8              1      1     1       0      1       0
9              1      1     1       0      0       0
10             0      0     0       0      0       1
       antecedents   consequents  support  confidence      lift  leverage
14    (White, Red)       (Green)      0.2         0.5  2.500000      0.12
15         (Green)  (White, Red)      0.2         1.0  2.500000      0.12
4          (Green)         (Red)      0.2         1.0  1.666667      0.08
12  (Green, White)         (Red)      0.2         1.0  1.666667     

# Interpreting the Results
우리는 위에서 얻은 각각의 규칙들의 성능에 대한 정보를 해석해 볼 수 있다. <br>
예를 들어서,
**rule {orange} &rightarrow {white}** 는 다음과 같이 해석할 수 있다.<br>
> 만약 orange를 구매한다면, 100%의 신뢰도로 white도 구매될 것이다.<br>
또한 이 규칙은 1.43의 향상비를 갖는다.

결과를 해석하는데에 다양한 측정지표를 살피는 것은 도움이 된다. <br><br>
**Support**는 얼마나 많은 거래들이 규칙의 영향을 받았는지의 지표로써 활용할 수 있고, 만약 support가 작다면 그 규칙은 유용성이 떨어진다고 판단할 수 있다. 
<br><br>
**lift(향상비)**는 전체에서 결론부를 찾는 거에 비해 규칙을 적용했을때 결론부를 찾는 것이 얼마나 효율적인지를 보여준다.<br>
(이때, 지지도 또한 같이 고려해야 하는데, 매우 낮은 지지도를 갖는 매우 효율적인 규칙은 더 높은 지지도를 갖지만 좀 덜 효율적인 규칙보다 바람직하지 않을 수 있다.)
<br><br>
**confidence(신뢰도)**는 우리에게 조건부가 주어졌을때 결론부가 어느정도의 비율로 찾아지는지 말해주고, 이는 실질적인 유효성에 관한 정보를 제공해준다.<br>
(따라서, 신뢰도가 낮은 규칙은 거래에서 조건부에 해당하는 itemset이 거래됐을때 결론부에 해당하는 itemset을 promotion 할 가치가 없는 것을 의미한다.)


# Example 2: Rules for Similar Book Purchases

In [7]:
# load dataset
all_books_df = pd.read_csv("CharlesBookClub.csv")

# create the binary incidence matrix
ignore = ['Seq#','ID#','Gender','M','R','FirstPurch','Related Purchase','Mcode','Rcode','Fcode','Yes_Florence','No_Florence']
count_books = all_books_df.drop(columns = ignore)
# 거래된 책의 개수가 나오기에 1로 만들어준다.
count_books[count_books > 0] = 1

# create frequent itemsets and rules
itemsets = apriori(count_books,min_support = 200/4000, use_colnames = True)
rules = association_rules(itemsets,metric = "confidence", min_threshold = 0.5)

# Display 25 rules with highest lift
rules.sort_values(by=['lift'],ascending = False).head(25)



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
263,"(YouthBks, RefBks, F)","(CookBks, ChildBks)",0.08125,0.242,0.05525,0.68,2.809917,0.035588,2.36875,0.70108
267,"(YouthBks, RefBks)","(CookBks, F, ChildBks)",0.08125,0.242,0.05525,0.68,2.809917,0.035588,2.36875,0.70108
231,"(YouthBks, RefBks)","(CookBks, ChildBks)",0.08125,0.242,0.05525,0.68,2.809917,0.035588,2.36875,0.70108
287,"(DoItYBks, RefBks)","(CookBks, F, ChildBks)",0.0925,0.242,0.06125,0.662162,2.736207,0.038865,2.24368,0.699207
240,"(DoItYBks, RefBks)","(CookBks, ChildBks)",0.0925,0.242,0.06125,0.662162,2.736207,0.038865,2.24368,0.699207
283,"(DoItYBks, RefBks, F)","(CookBks, ChildBks)",0.0925,0.242,0.06125,0.662162,2.736207,0.038865,2.24368,0.699207
253,"(DoItYBks, YouthBks, F)","(CookBks, ChildBks)",0.10325,0.242,0.067,0.64891,2.681448,0.042014,2.158993,0.699266
258,"(DoItYBks, YouthBks)","(CookBks, F, ChildBks)",0.10325,0.242,0.067,0.64891,2.681448,0.042014,2.158993,0.699266
227,"(DoItYBks, YouthBks)","(CookBks, ChildBks)",0.10325,0.242,0.067,0.64891,2.681448,0.042014,2.158993,0.699266
247,"(RefBks, GeogBks)","(CookBks, ChildBks)",0.08175,0.242,0.05025,0.614679,2.539995,0.030467,1.96719,0.660276


# Example3 Surprise 패키지를 이용한 넷플릭스 데이터와 유사한 데이터에 협업 필터링을 수행

In [13]:
A = defaultdict(list)
print(A)
A['key2'] = 'A'
print(A['key3'])
print(A)

defaultdict(<class 'list'>, {})
[]
defaultdict(<class 'list'>, {'key2': 'A', 'key3': []})


In [14]:
# 협업 필터링을 위한 파이썬 코드: 데이터셋 준비 및 헬퍼 정의 기능
import random
random.seed(0)
nratings = 5000
randomData = pd.DataFrame({
    'itemID':[random.randint(0,99) for _ in range(nratings)],
    'userID':[random.randint(0,999) for _ in range(nratings)],
    'rating':[random.randint(1,5) for _ in range(nratings)],
})
def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    byUser = defaultdict(list) # 값을 지정하지 않으면 리스트로 초기화 값을 지정하면 그 값으로 설정
    for p in predictions:
        byUser[p.uid].append(p)
    
    # For each user, reduce predictions to top-n
    for uid, userPredictions in byUser.items():
        # heapq 모듈을 통해 heap 자료구조를 선언한 이후
        # heapq 모듈에 있는 nlargest()와 nsmallest() 함수를 사용해서 최대 or 최소값을 찾을 수 있다. 
        byUser[uid] = heapq.nlargest(n,userPredictions, key = lambda p: p.est)
    return byUser

# Surprise 패키지 사용
* Surprise에는 추천 시스템 테스트를 위한 많은 기능이 있다.
* Surprise에서는 다양한 종류의 알고리즘을 사용할 수 있는데 사용 가능한 알고리즘은 다음과 같다.
* `random_pred.NormalPredictior` : Training set의 분포가 정규분포라고 가정한 상태에서 무작위로 평점을 무작위로 추출하는 알고리즘 일반적으로 성능이 안좋다.
* `baseline_only.BaselineOnly` : 사용자의 평점평균과 아이템의 평점평균을 모델화해서 예측하는 알고리즘
* `knns.KNNBasic` : 이웃을 고려한 기본적인 CF알고리즘
* `knns.KNNWithMeans`: 사용자의 평가경향을 고려한 CF 알고리즘
* `knns.KNNWithZScore`: 사용자의 평가경향을 표준화시킨 CF 알고리즘
* `knns.KNNBaseline` : 사용자의 평점평균과 아이템의 평점평균을 모델화시킨 것을 고려한 CF 알고리즘
* `matrix_factorization.SVD` : MF 알고리즘
* `matrix_factorization.SVDpp` : MF를 기반으로 사용자의 특정 아이템에 대한 평가여부를 이진값으로 일종의 암묵적 평가(implicit ratings)로 추가한 SVD++ 알고리즘
* `matrix_factorization.NMF` : 행렬의 값이 전부 양수일 때 사용 가능한 MF 알고리즘
* `slope_one.SlopeOne` : 간단하면서도 정확도가 높은 것이 특징인 SlopeOne알고리즘을 적용한 Item-based CF 알고리즘
* `co_clustering.CoClustering` : 사용자와 아이템을 동시에 클러스터링하는 기법을 적용한 CF 알고리즘

* docs https://surprise.readthedocs.io/en/stable/predictions_module.html?highlight=est#surprise.prediction_algorithms.predictions.Prediction

In [16]:
# Convert the data set into the format required by the surprise package
# The columns must correspond to user id, item id, and ratings (in that order)
reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(randomData[['userID','itemID','rating']],reader)

# Split into training and test set
trainset, testset = train_test_split(data,test_size = 0.25, random_state = 1)
trainset

<surprise.trainset.Trainset at 0x7fab26f05b20>

* uid = user id
* iid = item id
* r_ui = 실제 rating
* est = 추정 rating

In [19]:
## User-based filtering
# compute cosine similarity between users
sim_options = {'name':'cosine', 'user_based':True}
algo = KNNBasic(sim_options = sim_options)
algo.fit(trainset)

# predict ratings for all pairs (u,i) that are NOT in the training set
predictions = algo.test(testset)
predictions 

Computing the cosine similarity matrix...
Done computing similarity matrix.


[Prediction(uid=6, iid=77, r_ui=4.0, est=2.5, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid=222, iid=77, r_ui=1.0, est=3.5, details={'actual_k': 8, 'was_impossible': False}),
 Prediction(uid=424, iid=45, r_ui=3.0, est=3.1023181005456952, details={'actual_k': 9, 'was_impossible': False}),
 Prediction(uid=87, iid=27, r_ui=1.0, est=3.0, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=121, iid=98, r_ui=2.0, est=3.4808722229993214, details={'actual_k': 6, 'was_impossible': False}),
 Prediction(uid=357, iid=14, r_ui=2.0, est=2.25, details={'actual_k': 4, 'was_impossible': False}),
 Prediction(uid=404, iid=4, r_ui=2.0, est=2.9885333333333333, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid=900, iid=65, r_ui=5.0, est=2.857142857142857, details={'actual_k': 7, 'was_impossible': False}),
 Prediction(uid=185, iid=53, r_ui=2.0, est=3.625, details={'actual_k': 8, 'was_impossible': False}),
 Prediction(uid=875, iid=

In [26]:
# Print the recommended items for each user
top_n = get_top_n(predictions,n=4)
print('Top-4 recommended items for each user')
for uid,user_ratings in list(top_n.items())[:5]:
    print('User {}'.format(uid))
    for prediction in user_ratings:
        print(f'Item {prediction.iid} ({prediction.est:.2f})',end = '')
        print()

Top-4 recommended items for each user
User 6
Item 6 (5.00)
Item 77 (2.50)
Item 60 (1.00)
User 222
Item 77 (3.50)
Item 75 (2.78)
User 424
Item 14 (3.50)
Item 45 (3.10)
Item 54 (2.34)
User 87
Item 27 (3.00)
Item 54 (3.00)
Item 82 (3.00)
Item 32 (1.00)
User 121
Item 98 (3.48)
Item 32 (2.83)


In [27]:
trainset = data.build_full_trainset()
sim_options = {'name':'cosine','user_based':False}
algo = KNNBasic(sim_options = sim_options)
algo.fit(trainset)

# predict rating for user 383 and item 7
algo.predict(383,7)

Computing the cosine similarity matrix...
Done computing similarity matrix.


Prediction(uid=383, iid=7, r_ui=None, est=2.3661840936304324, details={'actual_k': 4, 'was_impossible': False})