-
Notifications
You must be signed in to change notification settings - Fork 36
/
apriori.py
179 lines (153 loc) · 7.02 KB
/
apriori.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 08 17:43:21 2020
@author: wenbin
"""
from numpy import *
import pandas as pd
# 构造数据
def loadDataSet():
return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
# 将所有元素转换为frozenset型字典,存放到列表中
def createC1(dataSet):
C1 = []
for transaction in dataSet:
for item in transaction:
if not [item] in C1:
C1.append([item])
C1.sort()
# 使用frozenset是为了后面可以将这些值作为字典的键
return list(map(frozenset, C1)) # frozenset一种不可变的集合,set可变集合
# 过滤掉不符合支持度的集合
# 返回 频繁项集列表retList 所有元素的支持度字典
def scanD(D, Ck, minSupport):
ssCnt = {}
for tid in D:
for can in Ck:
if can.issubset(tid): # 判断can是否是tid的《子集》 (这里使用子集的方式来判断两者的关系)
if can not in ssCnt: # 统计该值在整个记录中满足子集的次数(以字典的形式记录,frozenset为键)
ssCnt[can] = 1
else:
ssCnt[can] += 1
numItems = float(len(D))
retList = [] # 重新记录满足条件的数据值(即支持度大于阈值的数据)
supportData = {} # 每个数据值的支持度
for key in ssCnt:
support = ssCnt[key] / numItems
if support >= minSupport:
retList.insert(0, key)
supportData[key] = support
return retList, supportData # 排除不符合支持度元素后的元素 每个元素支持度
# 生成所有可以组合的集合
# 频繁项集列表Lk 项集元素个数k [frozenset({2, 3}), frozenset({3, 5})] -> [frozenset({2, 3, 5})]
def aprioriGen(Lk, k):
retList = []
lenLk = len(Lk)
for i in range(lenLk): # 两层循环比较Lk中的每个元素与其它元素
for j in range(i+1, lenLk):
L1 = list(Lk[i])[:k-2] # 将集合转为list后取值
L2 = list(Lk[j])[:k-2]
L1.sort(); L2.sort() # 这里说明一下:该函数每次比较两个list的前k-2个元素,如果相同则求并集得到k个元素的集合
if L1==L2:
retList.append(Lk[i] | Lk[j]) # 求并集
return retList # 返回频繁项集列表Ck
# 封装所有步骤的函数
# 返回 所有满足大于阈值的组合 集合支持度列表
def apriori(dataSet, minSupport = 0.5):
D = list(map(set, dataSet)) # 转换列表记录为字典 [{1, 3, 4}, {2, 3, 5}, {1, 2, 3, 5}, {2, 5}]
C1 = createC1(dataSet) # 将每个元素转会为frozenset字典 [frozenset({1}), frozenset({2}), frozenset({3}), frozenset({4}), frozenset({5})]
L1, supportData = scanD(D, C1, minSupport) # 过滤数据
L = [L1]
k = 2
while (len(L[k-2]) > 0): # 若仍有满足支持度的集合则继续做关联分析
Ck = aprioriGen(L[k-2], k) # Ck候选频繁项集
Lk, supK = scanD(D, Ck, minSupport) # Lk频繁项集
supportData.update(supK) # 更新字典(把新出现的集合:支持度加入到supportData中)
L.append(Lk)
k += 1 # 每次新组合的元素都只增加了一个,所以k也+1(k表示元素个数)
return L, supportData
####################################################################################
# 关联规则挖掘(来源于网络的代码,关联规则挖掘这一部分是错误的)
# 获取关联规则的封装函数
def generateRules(L, supportData, minConf=0.7): # supportData 是一个字典
bigRuleList = []
for i in range(1, len(L)): # 从为2个元素的集合开始
for freqSet in L[i]:
# 只包含单个元素的集合列表
H1 = [frozenset([item]) for item in freqSet] # frozenset({2, 3}) 转换为 [frozenset({2}), frozenset({3})]
# 如果集合元素大于2个,则需要处理才能获得规则
if (i > 1):
rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf) # 集合元素 集合拆分后的列表 。。。
else:
calcConf(freqSet, H1, supportData, bigRuleList, minConf)
return bigRuleList
# 对规则进行评估 获得满足最小可信度的关联规则
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
prunedH = [] # 创建一个新的列表去返回
for conseq in H:
conf = supportData[freqSet]/supportData[freqSet-conseq] # 计算置信度
if conf >= minConf:
print(freqSet-conseq,'-->',conseq,'conf:',conf)
brl.append((freqSet-conseq, conseq, conf))
prunedH.append(conseq)
return prunedH
# 生成候选规则集合
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
m = len(H[0])
if (len(freqSet) > (m + 1)): # 尝试进一步合并
Hmp1 = aprioriGen(H, m+1) # 将单个集合元素两两合并
Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
if (len(Hmp1) > 1): #need at least two sets to merge
rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
#############################################################################
###原来的关联规则函数是错误的,没有考虑到所有情况,因此我重写了一个.
def calSupport(ab , dataSet):
count = 0
for i in range(len(dataSet)):
if ab.issubset(set(dataSet[i])) :
count += 1
return count / len(dataSet)
def calCon(a , b , dataSet , supportData ):
tmp1 = set(a) ; tmp2 = set(b)
ab = frozenset(tmp1.union(tmp2))
if ab in supportData.keys():
con = supportData[ab] / supportData[a]
else:
tmp = calSupport(ab , dataSet)
con = tmp / supportData[a]
return con
def geneRule(L, supportData,dataSet , minConf=0.7 ):
for i in range(len(L)):
for j in range(len(L)):
for k in range(len(L[i])):
for m in range(len(L[j])):
if i == j and k == m:
pass
else:
tmp = calCon(L[i][k] , L[j][m] , dataSet , supportData )
if tmp >= minConf:
print(L[i][k] , " -> " , L[j][m] , " , confidence = " , tmp)
#############################################################################
def splitstr(x):
x = x[1:-1]
x = x.split(",")
return x
def GetData():
# read data from csv
df = pd.read_csv("Groceries.csv" , encoding = 'gbk')
df = df[['items']]
df = df.applymap(splitstr)
data = []
for index in range(len(df)):
data.append(df.iloc[index , 0])
return data
if __name__ == "__main__":
dataSet = GetData()
# dataSet = loadDataSet()
L,suppData = apriori(dataSet,minSupport=0.15)
del L[-1]
geneRule(L,suppData, dataSet , minConf=0.2 )
print("频繁项集:")
print(L)
# print("rules:")
# print(rules)