In [1]:
import pandas as pd
import numpy
import time
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules

In [2]:
#檔案匯入
original_data = pd.read_csv("C:\\交易資料集.csv")

In [3]:
#資料預處理
original_data = original_data[original_data.QUANTITY >0] #數量為零或負值的交易代表退貨或註銷，應於前置處理中剔除
original_data = original_data[["ITEM_ID", "INVOICE_NO"]]

In [4]:
#資料分組(以發票號碼分組)
X = original_data.groupby(by=["INVOICE_NO"])
X = X.groups

invoice_no = []
items = []
for key in X:
    item = []
    for value in X.get(key):
        item.append(original_data.at[value, "ITEM_ID"])
    invoice_no.append(key)
    items.append(item)

In [5]:
#模型資料
data = {
    "invoice_no": invoice_no,
    "items": items
}
data = pd.DataFrame(data)

te = TransactionEncoder()
te_items = te.fit(items).transform(items)
te_items = pd.DataFrame(te_items, columns=te.columns_)

In [24]:
#Apriori
t1 = time.perf_counter()
result = apriori(te_items, min_support=0.0008,use_colnames=True, verbose=1)
t2 = time.perf_counter()
print("Apriori時間:", t2-t1, "秒")

Apriori_result_1 = association_rules(result, metric="lift", min_threshold=1)
Apriori_result_1

Processing 30 combinations | Sampling itemset size 5 432
Apriori時間: 22.52759429999992 秒


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(1696),(70509),0.002137,0.008137,0.000877,0.410256,50.415954,0.000859,1.681854
1,(70509),(1696),0.008137,0.002137,0.000877,0.107744,50.415954,0.000859,1.118360
2,(1697),(1698),0.002822,0.003534,0.001069,0.378641,107.128923,0.001059,1.603687
3,(1698),(1697),0.003534,0.002822,0.001069,0.302326,107.128923,0.001059,1.429288
4,(1697),(23004),0.002822,0.004740,0.000932,0.330097,69.640945,0.000918,1.485678
...,...,...,...,...,...,...,...,...,...
577,"(135492, 70509)","(88411, 135493)",0.001041,0.001260,0.000849,0.815789,647.275744,0.000848,5.421730
578,(88411),"(135492, 135493, 70509)",0.004576,0.000904,0.000849,0.185629,205.305389,0.000845,1.226831
579,(135493),"(88411, 135492, 70509)",0.001671,0.000959,0.000849,0.508197,529.947541,0.000848,2.031383
580,(70509),"(88411, 135492, 135493)",0.008137,0.000904,0.000849,0.104377,115.441077,0.000842,1.115532


In [6]:
#Apriori
t1 = time.perf_counter()
result = apriori(te_items, min_support=0.001, use_colnames=True, verbose=1)
t2 = time.perf_counter()
print("Apriori時間:", t2-t1, "秒")

Apriori_result_1 = association_rules(result, metric="lift", min_threshold=1)
Apriori_result_1

Processing 52 combinations | Sampling itemset size 4 32
Apriori時間: 10.533637599999999 秒


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(1697),(1698),0.002822,0.003534,0.001069,0.378641,107.128923,0.001059,1.603687
1,(1698),(1697),0.003534,0.002822,0.001069,0.302326,107.128923,0.001059,1.429288
2,(1697),(70509),0.002822,0.008137,0.001562,0.553398,68.006472,0.001539,2.220910
3,(70509),(1697),0.008137,0.002822,0.001562,0.191919,68.006472,0.001539,1.234008
4,(1697),(88411),0.002822,0.004576,0.001315,0.466019,101.848962,0.001302,1.864158
...,...,...,...,...,...,...,...,...,...
137,"(3336149, 70509)",(88411),0.001233,0.004576,0.001041,0.844444,184.554092,0.001036,6.399157
138,"(88411, 70509)",(3336149),0.002082,0.002274,0.001041,0.500000,219.867470,0.001036,1.995452
139,(3336149),"(88411, 70509)",0.002274,0.002082,0.001041,0.457831,219.867470,0.001036,1.840604
140,(88411),"(3336149, 70509)",0.004576,0.001233,0.001041,0.227545,184.554092,0.001036,1.292978


In [18]:
#Apriori 0.002
t1 = time.perf_counter()
result = apriori(te_items, min_support=0.002, use_colnames=True, verbose=1)
t2 = time.perf_counter()
print("Apriori時間:", t2-t1, "秒")

Apriori_result_1 = association_rules(result, metric="lift", min_threshold=1)
Apriori_result_1

Processing 3 combinations | Sampling itemset size 3e 2
Apriori時間: 1.7087598000000526 秒


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(1698),(70509),0.003534,0.008137,0.002055,0.581395,71.447028,0.002026,2.369449
1,(70509),(1698),0.008137,0.003534,0.002055,0.252525,71.447028,0.002026,1.333109
2,(88411),(70509),0.004576,0.008137,0.002082,0.45509,55.925482,0.002045,1.820231
3,(70509),(88411),0.008137,0.004576,0.002082,0.255892,55.925482,0.002045,1.337742


In [14]:
#FP-Growth support = 0.0008
t1 = time.perf_counter()
result = fpgrowth(te_items, min_support=0.0008, use_colnames=True)
t2 = time.perf_counter()
print("FP-Growth時間:", t2-t1, "秒")

FP_Growth_result_1 = association_rules(result, metric="lift", min_threshold=1)
FP_Growth_result_1

FP-Growth時間: 1.0959665000000314 秒


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(15157064),(15192100),0.002055,0.001891,0.000932,0.453333,239.793623,0.000928,1.825810
1,(15192100),(15157064),0.001891,0.002055,0.000932,0.492754,239.793623,0.000928,1.967377
2,(14671860),(15192100),0.001644,0.001891,0.001096,0.666667,352.637681,0.001093,2.994328
3,(15192100),(14671860),0.001891,0.001644,0.001096,0.579710,352.637681,0.001093,2.375399
4,(14675955),(15192100),0.001452,0.001891,0.001041,0.716981,379.251846,0.001038,3.526654
...,...,...,...,...,...,...,...,...,...
577,(15280990),(15336088),0.007891,0.002466,0.000822,0.104167,42.243056,0.000803,1.113526
578,(15336088),(15336004),0.002466,0.002822,0.000877,0.355556,125.990939,0.000870,1.547345
579,(15336004),(15336088),0.002822,0.002466,0.000877,0.310680,125.990939,0.000870,1.447127
580,(15419008),(15306063),0.003534,0.002438,0.000932,0.263566,108.085707,0.000923,1.354584


In [20]:
#FP-Growth support = 0.002
t1 = time.perf_counter()
result = fpgrowth(te_items, min_support=0.002, use_colnames=True)
t2 = time.perf_counter()
print("FP-Growth時間:", t2-t1, "秒")

FP_Growth_result_1 = association_rules(result, metric="lift", min_threshold=1)
FP_Growth_result_1

FP-Growth時間: 1.0517237999999907 秒


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(88411),(70509),0.004576,0.008137,0.002082,0.45509,55.925482,0.002045,1.820231
1,(70509),(88411),0.008137,0.004576,0.002082,0.255892,55.925482,0.002045,1.337742
2,(1698),(70509),0.003534,0.008137,0.002055,0.581395,71.447028,0.002026,2.369449
3,(70509),(1698),0.008137,0.003534,0.002055,0.252525,71.447028,0.002026,1.333109


In [8]:
#FP-Growth support = 0.001
t1 = time.perf_counter()
result = fpgrowth(te_items, min_support=0.001, use_colnames=True)
t2 = time.perf_counter()
print("FP-Growth時間:", t2-t1, "秒")

FP_Growth_result_1 = association_rules(result, metric="lift", min_threshold=1)
FP_Growth_result_1

FP-Growth時間: 1.0690787999999998 秒


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(14671860),(15192100),0.001644,0.001891,0.001096,0.666667,352.637681,0.001093,2.994328
1,(15192100),(14671860),0.001891,0.001644,0.001096,0.579710,352.637681,0.001093,2.375399
2,(15194056),(15192100),0.001452,0.001891,0.001123,0.773585,409.192781,0.001121,4.408317
3,(15192100),(15194056),0.001891,0.001452,0.001123,0.594203,409.192781,0.001121,2.460707
4,(14675955),(15192100),0.001452,0.001891,0.001041,0.716981,379.251846,0.001038,3.526654
...,...,...,...,...,...,...,...,...,...
137,(15280990),(15286972),0.007891,0.004302,0.001534,0.194444,45.202760,0.001500,1.236039
138,(14790248),(14782215),0.005315,0.003123,0.001425,0.268041,85.815518,0.001408,1.361930
139,(14782215),(14790248),0.003123,0.005315,0.001425,0.456140,85.815518,0.001408,1.828936
140,(14790248),(14929876),0.005315,0.002603,0.001096,0.206186,79.214324,0.001082,1.256461


In [9]:
#存檔 & 讀檔
Apriori_result_1[["antecedents", "consequents"]].to_csv("Apriori_result_1.csv")
pd.read_csv("Apriori_result_1.csv")

FP_Growth_result_1[["antecedents", "consequents"]].to_csv("FP_Growth_result_1.csv")
pd.read_csv("FP_Growth_result_1.csv")

Unnamed: 0.1,Unnamed: 0,antecedents,consequents
0,0,frozenset({14671860}),frozenset({15192100})
1,1,frozenset({15192100}),frozenset({14671860})
2,2,frozenset({15194056}),frozenset({15192100})
3,3,frozenset({15192100}),frozenset({15194056})
4,4,frozenset({14675955}),frozenset({15192100})
...,...,...,...
137,137,frozenset({15280990}),frozenset({15286972})
138,138,frozenset({14790248}),frozenset({14782215})
139,139,frozenset({14782215}),frozenset({14790248})
140,140,frozenset({14790248}),frozenset({14929876})


In [10]:
#推薦產品
items = input("請輸入產品ID(如有多項產品請以逗號隔開，如:1, 2, 3):")
items = [int(n) for n in items.split(", ")]
items.sort()

antecedents = FP_Growth_result_1["antecedents"]
consequents = FP_Growth_result_1["consequents"]

recommend_list = []
index = -1
for i in antecedents:
    index += 1
    f = True
    data = list(i)
    
    for item in items:
        if item not in data:
            f = False
    if not f:
        continue
            
    for recommend in list(consequents[index]):
        recommend_list.append(recommend)
        
print("向您推薦ID", set(recommend_list), "之商品")

請輸入產品ID(如有多項產品請以逗號隔開，如:1, 2, 3):15194056
向您推薦ID {15192100} 之商品
