# 가짜연구소 4기 - 추천시스템 톺아보기
## Association analysis
- 작성자: 김민수(kimminsu.ds@gmail.com)
- 출처: https://archive.ics.uci.edu/ml/datasets/online+retail#

#### Abstract
- This is a transaction data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based registered non-store online retailer.

#### Attribute Information
- InvoiceNo: Invoice number
    - If this code starts with letter 'C', it indicates a cancellation
- StockCode: Product(item) code
- Description: Product(item) name
- Quantity: The quantities of each product (item) per transaction
- InvoiceDate: The day and time when each transaction was generated
- UnitPrice: Product price per unit in sterling
- CustomerID: Customer number
- Country: The name of country where each customer resides

## 00. 환경설정

### 00-01. 패키지

In [1]:
import pandas as pd
from datetime import datetime

### 00-02. UCI Online retail 데이터

In [2]:
# Location of the UCI Online Retail transactions CSV, relative to this notebook.
path = "../data/online retail.csv"

# Load InvoiceDate as text first, then convert the whole column in one
# vectorized call with an explicit format. `read_csv(date_parser=...)` is
# deprecated since pandas 2.0, and a scalar `datetime.strptime` callback
# cannot be applied to the column as a whole.
df = pd.read_csv(path)
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='%Y-%m-%d %H:%M')

#### 데이터 확인

In [3]:
# (rows, columns) of the freshly loaded data.
df.shape

(541909, 8)

In [4]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [5]:
# Per-column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
InvoiceNo      541909 non-null object
StockCode      541909 non-null object
Description    540455 non-null object
Quantity       541909 non-null int64
InvoiceDate    541909 non-null datetime64[ns]
UnitPrice      541909 non-null float64
CustomerID     406829 non-null float64
Country        541909 non-null object
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [6]:
# Number of missing values in each column.
df.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

#### 환불 데이터 제외

In [7]:
# Index labels of cancelled invoices: per the attribute notes, an InvoiceNo
# starting with 'C' indicates a cancellation. The vectorized `.str` accessor
# replaces a Python-level loop over every row, and `astype(str)` keeps the
# check from failing on any invoice number that is not a string.
is_cancellation = df['InvoiceNo'].astype(str).str.startswith('C')
idx = df.index[is_cancellation].tolist()
idx[:5]

[141, 154, 235, 236, 237]

In [8]:
# Drop every row whose index label is listed in `idx`, then renumber the rows.
in_idx = df.index.isin(idx)
df = df.loc[~in_idx].reset_index(drop=True)
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


#### Quantity, UnitPrice 0 이하 데이터 제외

In [9]:
# Keep only rows where both Quantity and UnitPrice are strictly positive,
# then renumber the rows.
has_positive_values = (df['Quantity'] > 0) & (df['UnitPrice'] > 0)
df = df.loc[has_positive_values].reset_index(drop=True)
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [10]:
# (rows, columns) remaining after the filters above.
df.shape

(530104, 8)

## 01. mlxtend 패키지를 활용한 연관분석

### 01-01. 데이터 준비

#### InvoiceNo별 데이터프레임 생성

In [11]:
# One row per invoice: the set of stock codes it contains and how many
# distinct codes that is.
stock_codes_by_invoice = df.groupby(['InvoiceNo'])['StockCode']
transaction_df = pd.DataFrame({
    "items": stock_codes_by_invoice.apply(set),
    "item_length": stock_codes_by_invoice.nunique(),
}).reset_index()

transaction_df

Unnamed: 0,InvoiceNo,items,item_length
0,536365,"{21730, 85123A, 71053, 84029E, 22752, 84406B, ...",7
1,536366,"{22633, 22632}",2
2,536367,"{22749, 21777, 22745, 48187, 84969, 22748, 217...",12
3,536368,"{22960, 22912, 22913, 22914}",4
4,536369,{21756},1
...,...,...,...
19955,581584,"{85038, 20832}",2
19956,581585,"{16016, 22460, 22915, 22481, 84945, 21916, 227...",21
19957,581586,"{20685, 23275, 22061, 21217}",4
19958,581587,"{22556, 22631, 22367, 22726, 22899, 23255, 232...",15


#### 한 개의 item만 포함된 transaction data는 연관분석 불가

In [12]:
# Preview invoices that contain exactly one distinct item.
transaction_df.loc[transaction_df['item_length'].eq(1)].head()

Unnamed: 0,InvoiceNo,items,item_length
4,536369,{21756},1
6,536371,{22086},1
9,536374,{21258},1
14,536380,{22961},1
25,536393,{22180},1


In [13]:
# Keep only invoices with more than one distinct item, then renumber the rows.
has_multiple_items = transaction_df['item_length'].gt(1)
transaction_df = transaction_df.loc[has_multiple_items].reset_index(drop=True)
transaction_df.head()

Unnamed: 0,InvoiceNo,items,item_length
0,536365,"{21730, 85123A, 71053, 84029E, 22752, 84406B, ...",7
1,536366,"{22633, 22632}",2
2,536367,"{22749, 21777, 22745, 48187, 84969, 22748, 217...",12
3,536368,"{22960, 22912, 22913, 22914}",4
4,536370,"{22631, 22661, 21913, POST, 22727, 22726, 2232...",20


#### TransactionEncoder를 활용한 데이터 변환

In [14]:
from mlxtend.preprocessing import TransactionEncoder

# One list of stock codes per invoice, in `transaction_df` row order.
# Built directly from the `items` column instead of via `iterrows()`, which
# constructs a Series for every row just to read a single field.
dataset = [list(items) for items in transaction_df['items']]

dataset[:2]

[['21730', '85123A', '71053', '84029E', '22752', '84406B', '84029G'],
 ['22633', '22632']]

In [15]:
# Learn the item vocabulary from the transactions, then encode each
# transaction as a boolean row over that vocabulary.
te = TransactionEncoder()
te.fit(dataset)
te_ary = te.transform(dataset)
te_ary

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [16]:
# Wrap the encoded array in a DataFrame whose columns are the encoder's
# item labels.
item_labels = te.columns_
te_df = pd.DataFrame(data=te_ary, columns=item_labels)
te_df

Unnamed: 0,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,DOT,M,PADS,POST,gift_0001_10,gift_0001_20,gift_0001_30,gift_0001_40,gift_0001_50,m
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18314,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
18315,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
18316,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
18317,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [18]:
# Sanity check: the encoded frame has one row per transaction.
transaction_df.shape[0] == te_df.shape[0]

True

### 01-02. apriori 알고리즘
- [apriori: Frequent itemsets via the Apriori algorithm](http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/)
    - Apriori function to extract frequent itemsets for association rule mining
- **`apriori()`** 함수를 통해 최소 지지도 0.01 조건을 만족하는 빈발항목집합(frequent itemsets) 생성

In [19]:
%%time

from mlxtend.frequent_patterns import apriori

# Minimum support threshold passed to apriori.
min_support = 0.01

# use_colnames=True reports itemsets by column label (stock code).
frequent_itemsets_apriori = apriori(
    te_df,
    min_support=min_support,
    use_colnames=True,
)
frequent_itemsets_apriori

Wall time: 4min 28s


Unnamed: 0,support,itemsets
0,0.010699,(10133)
1,0.027840,(15036)
2,0.017414,(15056BL)
3,0.024565,(15056N)
4,0.013538,(16161P)
...,...,...
2567,0.016431,"(22423, 22697, 22698, 22699)"
2568,0.010153,"(22917, 22916, 22918, 22919)"
2569,0.010153,"(22917, 22916, 22920, 22918)"
2570,0.010044,"(23203, 23202, 23199, 23200)"


### 01-03. fp-growth 알고리즘

- [fpgrowth: Frequent itemsets via the FP-growth algorithm](http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpgrowth/)
    - Function implementing FP-Growth to extract frequent itemsets for association rule mining
- **`fpgrowth()`** 함수를 통해 최소 지지도 0.01 조건을 만족하는 빈발항목집합(frequent itemsets) 생성

In [20]:
%%time

from mlxtend.frequent_patterns import fpgrowth

# Minimum support threshold passed to fpgrowth.
min_support = 0.01

# use_colnames=True reports itemsets by column label (stock code).
frequent_itemsets_fpgrowth = fpgrowth(
    te_df,
    min_support=min_support,
    use_colnames=True,
)
frequent_itemsets_fpgrowth

Wall time: 10.6 s


Unnamed: 0,support,itemsets
0,0.118948,(85123A)
1,0.025056,(84029G)
2,0.023255,(84029E)
3,0.020743,(22752)
4,0.018178,(71053)
...,...,...
2567,0.012883,"(23355, 23356)"
2568,0.014684,"(23355, 22112)"
2569,0.010372,"(22633, 23439)"
2570,0.011900,"(22865, 23439)"


## 02. 연관규칙 생성

### 02-01. mlxtend association_rules를 활용한 연관규칙 생성
- [association_rules: Association rules generation from frequent itemsets](http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/)
    - Function to generate association rules from frequent itemsets

```python
from mlxtend.frequent_patterns import association_rules

association_rules(frequent itemsets, metric= "support/confidence/lift", min_threshold = metric의 최솟값)
```

- 최소 신뢰도 0.2 조건을 만족하는 총 4,287개의 연관규칙 생성

In [20]:
from mlxtend.frequent_patterns import association_rules

# Rules with confidence of at least 0.2, ordered by descending lift and
# renumbered from zero.
rules = association_rules(frequent_itemsets_fpgrowth, metric="confidence", min_threshold=0.2)
result = rules.sort_values(by=['lift'], ascending=False).reset_index(drop=True)
result

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(22916, 22919)","(22918, 22917)",0.011245,0.011736,0.010153,0.902913,76.932355,0.010021,10.179115
1,"(22918, 22917)","(22916, 22919)",0.011736,0.011245,0.010153,0.865116,76.932355,0.010021,7.330424
2,"(22918, 22917)","(22916, 22920)",0.011736,0.011464,0.010153,0.865116,75.466977,0.010019,7.328805
3,"(22916, 22920)","(22918, 22917)",0.011464,0.011736,0.010153,0.885714,75.466977,0.010019,8.647306
4,"(22916, 22918)","(22917, 22919)",0.011627,0.011573,0.010153,0.873239,75.456949,0.010019,7.797593
...,...,...,...,...,...,...,...,...,...
4282,(21931),(85123A),0.064632,0.118948,0.013483,0.208615,1.753839,0.005795,1.113304
4283,(20727),(85123A),0.069382,0.118948,0.014302,0.206137,1.733007,0.006049,1.109829
4284,(20728),(85123A),0.062722,0.118948,0.012883,0.205396,1.726778,0.005422,1.108794
4285,(21080),(85123A),0.053715,0.118948,0.010918,0.203252,1.708754,0.004528,1.105811


- lift 기준 상위 2개 연관규칙을 구성하는 item

In [21]:
# Look up the distinct (StockCode, Description) pairs for the listed codes.
top_rule_codes = ['22919', '22916', '22917', '22918']
(
    df.loc[df['StockCode'].isin(top_rule_codes), ['StockCode', "Description"]]
    .drop_duplicates()
)

Unnamed: 0,StockCode,Description
962,22919,HERB MARKER MINT
963,22917,HERB MARKER ROSEMARY
966,22918,HERB MARKER PARSLEY
967,22916,HERB MARKER THYME


### 02-02. 연관분석을 활용한 추천

In [22]:
# Number of rows per stock code, most frequent first.
df['StockCode'].value_counts()

85123A      2265
85099B      2112
22423       2017
47566       1706
20725       1595
            ... 
21009          1
82613a         1
90181C         1
16169N         1
DCGS0069       1
Name: StockCode, Length: 3922, dtype: int64

In [23]:
# Distinct descriptions recorded under stock code 85123A.
df.loc[df['StockCode'].eq('85123A'), 'Description'].unique()

array(['WHITE HANGING HEART T-LIGHT HOLDER',
       'CREAM HANGING HEART T-LIGHT HOLDER'], dtype=object)

- antecedent로 **`85123A`**가 주어졌을 때 최소 신뢰도 0.2를 만족하는 상위 10개 연관규칙 바탕으로 추천

In [24]:
# Recommendation settings: confidence floor, number of rules to show, and
# the antecedent itemset to match exactly.
min_confidence = 0.2
k = 10
antecedent = ['85123A']

result = association_rules(
    frequent_itemsets_fpgrowth,
    metric="confidence",
    min_threshold=min_confidence,
)

# Rules whose antecedent is exactly the given itemset, best lift first.
matches_antecedent = result['antecedents'] == frozenset(antecedent)
result.loc[matches_antecedent].sort_values(by='lift', ascending=False).head(k)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
126,(85123A),(21733),0.118948,0.040286,0.026694,0.224415,5.570537,0.021902,1.237406


- antecedent로 **`85099B`**가 주어졌을 때 최소 신뢰도 0.2를 만족하는 상위 10개 연관규칙 추천

In [25]:
# Distinct descriptions recorded under stock code 85099B.
df.loc[df['StockCode'].eq('85099B'), 'Description'].unique()

array(['JUMBO BAG RED RETROSPOT'], dtype=object)

In [26]:
# Recommendation settings: confidence floor, number of rules to show, and
# the antecedent itemset to match exactly.
min_confidence = 0.2
k = 10
antecedent = ['85099B']

result = association_rules(
    frequent_itemsets_fpgrowth,
    metric="confidence",
    min_threshold=min_confidence,
)

# Rules whose antecedent is exactly the given itemset, best lift first.
matches_antecedent = result['antecedents'] == frozenset(antecedent)
result.loc[matches_antecedent].sort_values(by='lift', ascending=False).head(k)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2873,(85099B),(DOT),0.11327,0.038539,0.026475,0.233735,6.064859,0.02211,1.254737
156,(85099B),(22386),0.11327,0.066379,0.045035,0.39759,5.989686,0.037516,1.549811
2322,(85099B),(21928),0.11327,0.043943,0.029532,0.260723,5.933146,0.024555,1.293232
1613,(85099B),(85099F),0.11327,0.044326,0.029259,0.258313,5.827636,0.024238,1.288515
212,(85099B),(85099C),0.11327,0.050603,0.031934,0.281928,5.571342,0.026202,1.322147
3337,(85099B),(22385),0.11327,0.037393,0.023418,0.206747,5.529048,0.019183,1.213493
2040,(85099B),(20712),0.11327,0.047055,0.029205,0.257831,5.479364,0.023875,1.284001
178,(85099B),(21931),0.11327,0.064632,0.039522,0.348916,5.398468,0.032201,1.436631
306,(85099B),(21929),0.11327,0.047219,0.028277,0.249639,5.286854,0.022928,1.269763
664,(85099B),(22411),0.11327,0.064086,0.03712,0.327711,5.113573,0.029861,1.392129
