In [1]:
# Set up
%pylab
%pylab inline
%matplotlib inline
%load_ext autoreload
%autoreload 2
import tqdm
import random
import pandas as pd
from collections import Counter, defaultdict
from itertools import cycle, combinations
from operator import itemgetter

from sklearn import datasets, metrics, tree

import spectral
import seaborn as sns 
import tqdm
import copy 
import scipy

Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib
Populating the interactive namespace from numpy and matplotlib


In [186]:
class APRIORI:
    def __init__(self, transactions, support_count=100, delimiter=' '):
        self.records_no = len(transactions)
        self.support_count = support_count
        self.data = transactions
        self.columns = transactions.columns
        self.delimiter = delimiter
        
        '''
        k-th element in frequent_sets:
            list of tuples st.
            (set of items_ids, frequency, set_size = k)
        '''
        self.frequent_sets = [self.get_items_count()]
        
        self.data['set_size'] = self.data[self.columns[0]].str.count(delimiter) + 1
        self.data[self.columns[0]].dropna()
        self.data[self.columns[0]] = self.data[self.columns[0]].apply( \
                                     lambda row: set(map(int, row.strip().split(delimiter))))
        
        # L1 = 1-element frequent sets
        self.single_items_set = set()
        for t in self.frequent_sets[0]:
            self.single_items_set = self.single_items_set.union(t[0])
    
    
    def get_items_count(self):
        single_items = defaultdict(int)
        
        for i in tqdm.tqdm(range(len(self.data)), desc='Creating single frequent sets', \
                           position=0, leave=True):
            row = self.data.iloc[i][0].strip().split(self.delimiter)
            for item in row:
                single_items[int(item)] += 1

        res = []
        for item, cnt in single_items.items():
            if cnt >= self.support_count:
                res.append((set([item]), cnt, 1))
        return res
    
    
    def gen_Ck(self, k):
        Lk = []
        for t in self.frequent_sets[k - 1]:
            Lk.append(t[0])
        res = set()
#         a = time.time()
#         print(f'Generating time: {a}')
        for i in range(len(Lk)):
            x = Lk[i]
            for j in range(i, len(Lk)):
                y = Lk[j]
                united = x.union(y)
                if len(united) == k + 1:
                    res.add(tuple(united))
#         print(f'Ended, time = {time.time() - a}')
        return list(res)
                
    
    
    def RUN(self):
        length = 0
        for length in tqdm.tqdm(range(1, len(self.single_items_set)), \
                                desc='Expanding frequent sets', position=0, leave=True):

            self.data = self.data[self.data['set_size'] >= length]
            Ck = self.gen_Ck(length)
            d = defaultdict(int)

            for row in tqdm.tqdm(self.data[self.columns[0]], \
                                 desc='Creating Ck', position=0, leave=True):
                for items in Ck:
                    if set(items).issubset(row):
                        d[items] += 1
       
            if len(d) == 0:
                break
                
            res = []
            for items, cnt in d.items():
                if cnt >= self.support_count:
                    res.append((set(items), cnt, length + 1))
               
            if len(res) == 0:
                break
            
            self.frequent_sets.append(res)
            
            
    def create_assosiation_rules(self, set_size, confidence_threshold=0.4):
        support = {}
        assosiation_rules = []
            
        for Lk in self.frequent_sets:
            for t in Lk:
                items_set, cnt, sz = t
                items_set = tuple(sorted(items_set))
                support[items_set] = cnt
         
                S = []
                S.append([(items_set, None)])

                for k in range(0, len(items_set) - 1):
                    S.append([])
                    for rule in S[k]:
                        A = rule[0]
                        B = rule[1]
                        for i in range(len(A)):
                            tmp = B
                            if B is None:
                                tmp = (A[i], )
                            else: 
                                tmp += (A[i], )

                            A = tuple(sorted(set(A)))
                            tmp = tuple(sorted(set(tmp)))
                            sup_AC = support[A] / set_size
                            sup_A = support[(A[:i] + A[i+1:])] / set_size
                            sup_C = support[tmp] / set_size
                            confidence = sup_AC / sup_A
                            lift = confidence / sup_C
                            leverage = sup_AC - sup_A * sup_C
                            
                            if confidence > confidence_threshold:
                                S[k + 1].append((A[:i] + A[i+1:], tmp))
                                record = {'rule': str((A[:i] + A[i+1:], ' -> ', tmp)), 
                                         'confidence': confidence,
                                         'lift': lift,
                                         'leverage': leverage
                                         } 
                                assosiation_rules.append(record)
                            
        return assosiation_rules

# 0. Testing on wikipedia data <br/>
https://en.wikipedia.org/wiki/Apriori_algorithm

In [3]:
df = []
with open('Data/test.txt') as f:
    for x in f:
        df.append(x.split())
        
df = pd.DataFrame(df)
df

Unnamed: 0,0
0,1234
1,124
2,12
3,234
4,23
5,34
6,24


In [4]:
B = APRIORI(df.copy(), support_count=3, delimiter=',')
B.RUN()

Creating single frequent sets: 100%|███████████████████████████████████████████████████| 7/7 [00:00<00:00, 3508.62it/s]
Expanding frequent sets:   0%|                                                                   | 0/3 [00:00<?, ?it/s]


In [5]:
B.frequent_sets

[[({1}, 3, 1), ({2}, 6, 1), ({3}, 4, 1), ({4}, 5, 1)],
 [({1, 2}, 3, 2), ({2, 3}, 3, 2), ({3, 4}, 3, 2), ({2, 4}, 4, 2)]]

In [6]:
B.single_items_set

{1, 2, 3, 4}

In [7]:
B.frequent_sets

[[({1}, 3, 1), ({2}, 6, 1), ({3}, 4, 1), ({4}, 5, 1)],
 [({1, 2}, 3, 2), ({2, 3}, 3, 2), ({3, 4}, 3, 2), ({2, 4}, 4, 2)]]

In [8]:
pd.DataFrame.from_dict(B.create_assosiation_rules(set_size=df.shape[0], confidence_threshold=0.7))

Unnamed: 0,confidence,leverage,lift,rule
0,1.0,0.061224,1.166667,"((1,), ' -> ', (2,))"
1,0.75,-0.061224,0.875,"((3,), ' -> ', (2,))"
2,0.75,0.020408,1.05,"((3,), ' -> ', (4,))"
3,0.8,-0.040816,0.933333,"((4,), ' -> ', (2,))"


# 1. Some store DF

In [9]:
 def process_data(df):
        '''
        Returns pair X, Y, where
        X : list of transactions,
            where transaction is a list of products
            
        Y: unique list of all products
        '''
        records = []
        all_products = set()
        records_no, attrs_no = df.shape
        
        for i in range(records_no):
            row = df.iloc[i][df.iloc[i].notna()]
            records.append([str(x) for x in row])
            all_products = all_products.union(set(row))
            
        return records, all_products
    
    
def map_items_to_numbers(transactions, all_products):
    number_to_string = {}
    string_to_number = {}
    for i, item in enumerate(all_products):
        string_to_number[item] = i
        number_to_string[i] = item

    for i in range(len(transactions)):
        transactions[i] = list(map(lambda x: string_to_number[x],
                                            transactions[i]))

    return transactions, string_to_number, number_to_string

In [10]:
store_df = pd.read_csv('Data/store_data.csv', header=None)
store_df.shape

(7501, 20)

In [11]:
store_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [12]:
# map all transaction to numbers
transactions, all_products = process_data(store_df)

transactions, string_to_number, number_to_string = \
map_items_to_numbers(transactions, all_products)

transactions = [' '.join(map(str, row)) for row in transactions]
transactions = pd.DataFrame(transactions)

In [13]:
transactions.columns = ['items']
transactions.head()

Unnamed: 0,items
0,100 109 79 14 13 64 4 84 32 72 57 101 45 77 42...
1,22 31 30
2,17
3,16 79
4,42 58 53 108 101


In [14]:
support_count = 100
APR = APRIORI(transactions.copy(), support_count)

Creating single frequent sets: 100%|████████████████████████████████████████████| 7501/7501 [00:00<00:00, 10049.30it/s]


In [15]:
print(f'Found {sum([len(Lk) for Lk in APR.frequent_sets])} frequent sets')
for Lk in APR.frequent_sets[-1:]:
    for x in Lk[:5]:
        print(x)

Found 66 frequent sets
({100}, 536, 1)
({109}, 153, 1)
({79}, 250, 1)
({14}, 193, 1)
({84}, 239, 1)


In [16]:
%%time
APR.RUN()

Expanding frequent sets:   3%|█▊                                                        | 2/65 [00:05<02:43,  2.59s/it]


Wall time: 5.19 s


In [17]:
print(f'After running APRIORI {sum([len(Lk) for Lk in APR.frequent_sets])} frequent sets')
for Lk in APR.frequent_sets[-1:]:
    for x in Lk[:5]:
        print(x)

After running APRIORI 188 frequent sets
({42, 2, 30}, 107, 3)
({42, 2, 58}, 118, 3)
({2, 42, 38}, 128, 3)
({42, 2, 38}, 128, 3)
({42, 93, 30}, 101, 3)


In [18]:
pd.DataFrame.from_dict(APR.create_assosiation_rules(set_size=transactions.shape[0], confidence_threshold=0.4))

Unnamed: 0,confidence,leverage,lift,rule
0,0.401254,0.006927,1.683336,"((1,), ' -> ', (42,))"
1,0.419028,0.011898,1.757904,"((86,), ' -> ', (42,))"
2,0.416554,0.017507,1.747522,"((38,), ' -> ', (42,))"
3,0.456464,0.01102,1.914955,"((73,), ' -> ', (42,))"
4,0.443609,0.007278,1.861024,"((2, 58), ' -> ', (42,))"
5,0.416938,0.009938,2.394681,"((38, 42), ' -> ', (2,))"
6,0.435374,0.007722,1.826477,"((2, 38), ' -> ', (42,))"
7,0.416554,0.03506,6.974483,"((38,), ' -> ', (2, 42))"
8,0.416938,0.009938,2.394681,"((38, 42), ' -> ', (2,))"
9,0.435374,0.007722,1.826477,"((2, 38), ' -> ', (42,))"


# 2. Tom Brijs Retail Data Set

In [19]:
retail_df = pd.read_csv('Data/retail.dat', header=None, names=['transaction'])
retail_df = retail_df.drop([0])

In [20]:
print(retail_df.size)
retail_df.head()

88161


Unnamed: 0,transaction
1,30 31 32
2,33 34 35
3,36 37 38 39 40 41 42 43 44 45 46
4,38 39 47 48
5,38 39 48 49 50 51 52 53 54 55 56 57 58


In [21]:
%%time
support_count = 1500
APR_retail = APRIORI(retail_df.copy(), support_count, delimiter=' ')

Creating single frequent sets: 100%|██████████████████████████████████████████| 88161/88161 [00:08<00:00, 10857.31it/s]


Wall time: 8.53 s


In [22]:
print(f'Found {sum([len(Lk) for Lk in APR_retail.frequent_sets])} frequent sets')
for Lk in APR_retail.frequent_sets[-1:]:
    for x in Lk[:5]:
        print(x)

Found 23 frequent sets
({32}, 15167, 1)
({36}, 2936, 1)
({38}, 15596, 1)
({39}, 50675, 1)
({41}, 14945, 1)


In [23]:
%%time
APR_retail.RUN()

Expanding frequent sets:  14%|███████▉                                                  | 3/22 [00:08<00:53,  2.81s/it]


Wall time: 8.42 s


In [24]:
print(f'After APRIORI {sum([len(Lk) for Lk in APR_retail.frequent_sets])} frequent sets')
for Lk in APR_retail.frequent_sets[-1:]:
    for x in Lk[:5]:
        print(x)

After APRIORI 73 frequent sets
({48, 41, 38, 39}, 1991, 4)
({48, 41, 32, 39}, 1646, 4)
({32, 41, 48, 39}, 1646, 4)


In [25]:
pd.DataFrame.from_dict(APR_retail.create_assosiation_rules(set_size=retail_df.shape[0], confidence_threshold=0.8))

Unnamed: 0,confidence,leverage,lift,rule
0,0.950272,0.025755,5.371696,"((36,), ' -> ', (38,))"
1,0.975304,0.025303,5.513195,"((110,), ' -> ', (38,))"
2,0.978057,0.028162,5.528759,"((170,), ' -> ', (38,))"
3,0.954836,0.017974,5.39749,"((36, 39), ' -> ', (38,))"
4,0.816811,0.024755,1.421033,"((41, 48), ' -> ', (39,))"
5,0.989198,0.016207,5.591736,"((39, 110), ' -> ', (38,))"
6,0.989198,0.016207,5.591736,"((39, 110), ' -> ', (38,))"
7,0.980573,0.01877,5.542979,"((39, 170), ' -> ', (38,))"
8,0.987797,0.014321,5.583815,"((48, 170), ' -> ', (38,))"
9,0.838669,0.007105,1.45906,"((38, 41, 48), ' -> ', (39,))"


# 3. Korsarak

In [26]:
%%time
kosarak_df = pd.read_csv('Data/kosarak.dat', header=None)

Wall time: 535 ms


In [27]:
print(kosarak_df.shape)
kosarak_df.head()

(990002, 1)


Unnamed: 0,0
0,1 2 3
1,1
2,4 5 6 7
3,1 8
4,9 10


In [35]:
%%time
support_count = 10000
APR_kosarak = APRIORI(kosarak_df.copy(), support_count)

Creating single frequent sets: 100%|████████████████████████████████████████| 990002/990002 [01:32<00:00, 10760.50it/s]


Wall time: 1min 37s


In [36]:
print(f'Found {sum([len(Lk) for Lk in APR_kosarak.frequent_sets])} frequent sets')
for Lk in APR_kosarak.frequent_sets[-1:]:
    for x in Lk[:5]:
        print(x)

Found 54 frequent sets
({1}, 197522, 1)
({2}, 42927, 1)
({3}, 450031, 1)
({4}, 78097, 1)
({6}, 601374, 1)


In [37]:
%%time
APR_kosarak.RUN()

Creating Ck: 100%|███████████████████████████████████████████████████████████| 990002/990002 [05:26<00:00, 3030.12it/s]
Creating Ck: 100%|███████████████████████████████████████████████████████████| 837206/837206 [07:26<00:00, 1873.75it/s]
Creating Ck: 100%|███████████████████████████████████████████████████████████| 638834/638834 [03:00<00:00, 3544.11it/s]
Creating Ck: 100%|██████████████████████████████████████████████████████████| 465030/465030 [00:20<00:00, 22211.70it/s]
Creating Ck: 100%|█████████████████████████████████████████████████████████| 345114/345114 [00:00<00:00, 402027.21it/s]


Wall time: 16min 15s


In [39]:
print(f'After APRIORI {sum([len(Lk) for Lk in APR_kosarak.frequent_sets])} frequent sets')
for Lk in APR_kosarak.frequent_sets[-1:]:
    for x in Lk[:5]:
        print(x)

After APRIORI 418 frequent sets
({3, 6, 7, 11, 27}, 13574, 5)
({1, 6, 7, 11, 27}, 10347, 5)
({1, 3, 6, 148, 218}, 10638, 5)
({1, 6, 11, 148, 218}, 22444, 5)
({1, 3, 6, 11, 148}, 10493, 5)


In [51]:
pd.DataFrame.from_dict(APR_kosarak.create_assosiation_rules(set_size=kosarak_df.shape[0], confidence_threshold=0.90))

Unnamed: 0,confidence,leverage,lift,rule
0,0.912458,0.003410,1.502119,"((32,), ' -> ', (6,))"
1,0.924508,0.007713,1.521956,"((40,), ' -> ', (6,))"
2,0.961187,0.004539,1.582339,"((49,), ' -> ', (6,))"
3,0.961846,0.005639,1.583423,"((56,), ' -> ', (6,))"
4,0.903142,0.006759,1.486782,"((69,), ' -> ', (6,))"
5,0.912898,0.005703,1.502843,"((87,), ' -> ', (6,))"
6,0.930802,0.010521,1.532318,"((83,), ' -> ', (6,))"
7,0.965926,0.006631,1.590139,"((135,), ' -> ', (6,))"
8,0.964917,0.007966,1.588479,"((90,), ' -> ', (6,))"
9,0.962763,0.004125,1.584933,"((155,), ' -> ', (6,))"


# 4. Ta-Feng.

In [168]:
columns=['date', 'customerID', 'Age', 'Residence', 'productSubclass', 'productID', 'Amount', 'Asset', 'Price']
ta_feng_df1 = pd.read_csv('Data/Ta-Feng/D01', sep=';', names=columns)
ta_feng_df2 = pd.read_csv('Data/Ta-Feng/D02', sep=';', names=columns)
ta_feng_df3 = pd.read_csv('Data/Ta-Feng/D11', sep=';', names=columns)
ta_feng_df4 = pd.read_csv('Data/Ta-Feng/D12', sep=';', names=columns)

In [169]:
print(ta_feng_df1.shape, ta_feng_df2.shape, ta_feng_df3.shape, ta_feng_df4.shape)
ta_feng_df1.head()

(216864, 9) (199039, 9) (223622, 9) (178216, 9)


Unnamed: 0,date,customerID,Age,Residence,productSubclass,productID,Amount,Asset,Price
0,2001-01-01 00:00:00,141833,F,F,130207,4710105011011,2,44,52
1,2001-01-01 00:00:00,1376753,E,E,110217,4710265849066,1,150,129
2,2001-01-01 00:00:00,1603071,E,G,100201,4712019100607,1,35,39
3,2001-01-01 00:00:00,1738667,E,F,530105,4710168702901,1,94,119
4,2001-01-01 00:00:00,2141497,A,B,320407,4710431339148,1,100,159


In [171]:
%%time
ta_feng_df1 = ta_feng_df1.groupby(['customerID', 'date'], as_index=False).agg(lambda x: x.tolist())
ta_feng_df2 = ta_feng_df2.groupby(['customerID', 'date'], as_index=False).agg(lambda x: x.tolist())
ta_feng_df3 = ta_feng_df3.groupby(['customerID', 'date'], as_index=False).agg(lambda x: x.tolist())
ta_feng_df4 = ta_feng_df4.groupby(['customerID', 'date'], as_index=False).agg(lambda x: x.tolist())

Wall time: 54.9 s


In [172]:
ta_feng_df1.head()

Unnamed: 0,customerID,date,Age,Residence,productSubclass,productID,Amount,Asset,Price
0,1069,2001-01-21 00:00:00,"[K , K , K ]","[E , E , E ]","[110333, 100311, 110333]","[4710320224661, 4710022101208, 4712603661644]","[1, 1, 1]","[361, 197, 313]","[425, 198, 348]"
1,1113,2001-01-06 00:00:00,"[K , K , K , K , K , K , K ]","[F , F , F , F , F , F , F ]","[110109, 110136, 110605, 100312, 110401, 11060...","[4710088620750, 4710008251125, 4710254015021, ...","[1, 1, 2, 1, 1, 1, 2]","[161, 23, 68, 40, 135, 21, 68]","[188, 28, 84, 47, 169, 28, 84]"
2,1823,2001-01-24 00:00:00,"[K , K , K ]","[D , D , D ]","[720317, 500201, 110114]","[78698703015, 4710114128038, 4710126392175]","[1, 1, 1]","[55, 138, 148]","[79, 169, 185]"
3,2189,2001-01-03 00:00:00,"[K , K , K , K , K , K , K , K , K , K , K , K...","[B , B , B , B , B , B , B , B , B , B , B , B...","[500210, 500202, 712901, 530403, 501002, 76053...","[4710114105046, 4710036009071, 4902704881052, ...","[4, 4, 3, 2, 1, 2, 3, 3, 3, 2, 1, 1, 1, 1, 4, ...","[564, 400, 54, 656, 27, 164, 261, 627, 150, 62...","[624, 432, 66, 798, 40, 238, 297, 684, 207, 90..."
4,4282,2001-01-13 00:00:00,"[J , J , J , J , J , J , J ]","[E , E , E , E , E , E , E ]","[530405, 500307, 530209, 530412, 500201, 50020...","[4710363609005, 4710466101130, 4714058833126, ...","[1, 1, 1, 1, 1, 1, 1]","[82, 82, 90, 65, 138, 133, 55]","[115, 125, 99, 85, 148, 155, 69]"


In [175]:
ta_feng_df1 = ta_feng_df1[['productID']]
ta_feng_df2 = ta_feng_df2[['productID']]
ta_feng_df3 = ta_feng_df3[['productID']]
ta_feng_df4 = ta_feng_df4[['productID']]

ta_feng_df1['productID'] = ta_feng_df1['productID'].apply(lambda row: ' '.join(map(str, row)))
ta_feng_df2['productID'] = ta_feng_df2['productID'].apply(lambda row: ' '.join(map(str, row)))
ta_feng_df3['productID'] = ta_feng_df3['productID'].apply(lambda row: ' '.join(map(str, row)))
ta_feng_df4['productID'] = ta_feng_df4['productID'].apply(lambda row: ' '.join(map(str, row)))

In [176]:
ta_feng_df1.head()

Unnamed: 0,productID
0,4710320224661 4710022101208 4712603661644
1,4710088620750 4710008251125 4710254015021 3700...
2,78698703015 4710114128038 4710126392175
3,4710114105046 4710036009071 4902704881052 5610...
4,4710363609005 4710466101130 4714058833126 4713...


# 4.1

In [193]:
%%time
support_count = 150
APR_ta_feng1 = APRIORI(ta_feng_df1.copy(), support_count)
print(f'Found {sum([len(Lk) for Lk in APR_ta_feng1.frequent_sets])} frequent sets')
for Lk in APR_ta_feng1.frequent_sets[-1:]:
    for x in Lk[:5]:
        print(x)

Creating single frequent sets: 100%|██████████████████████████████████████████| 29901/29901 [00:02<00:00, 10555.08it/s]


Found 152 frequent sets
({4710022101208}, 357, 1)
({37000440147}, 181, 1)
({4710114128038}, 428, 1)
({4710114105046}, 239, 1)
({4710036009071}, 163, 1)
Wall time: 2.95 s


In [194]:
APR_ta_feng1.RUN()
print(f'After APRIORI {sum([len(Lk) for Lk in APR_ta_feng1.frequent_sets])} frequent sets')
for Lk in APR_ta_feng1.frequent_sets[-1:]:
    for x in Lk[:5]:
        print(x)

Creating Ck: 100%|██████████████████████████████████████████████████████████████| 29901/29901 [01:25<00:00, 350.79it/s]
Creating Ck: 100%|███████████████████████████████████████████████████████████| 25479/25479 [00:00<00:00, 277588.75it/s]
Creating Ck: 100%|██████████████████████████████████████████████████████████| 21747/21747 [00:00<00:00, 2422927.51it/s]


After APRIORI 166 frequent sets
({4710011401128, 4710011405133, 4710011401135}, 154, 3)


In [195]:
pd.DataFrame.from_dict(APR_ta_feng1.create_assosiation_rules(set_size=ta_feng_df1.shape[0], confidence_threshold=0.6))

Unnamed: 0,confidence,leverage,lift,rule
0,0.693267,0.009021,33.651577,"((4710011405133,), ' -> ', (4710011401128,))"
1,0.605128,0.007624,29.373277,"((4710011406123,), ' -> ', (4710011401128,))"
2,0.726688,0.007344,35.273865,"((4710011409056,), ' -> ', (4710011401128,))"
3,0.772973,0.00931,37.52056,"((4710011401135,), ' -> ', (4710011401128,))"
4,0.618454,0.008022,30.515493,"((4710018004704,), ' -> ', (4710018004605,))"
5,0.781726,0.005015,37.945431,"((4710011401135, 4710011405133), ' -> ', (4710..."


# 4.2

In [196]:
%%time
support_count = 150
APR_ta_feng2 = APRIORI(ta_feng_df2.copy(), support_count)
print(f'Found {sum([len(Lk) for Lk in APR_ta_feng2.frequent_sets])} frequent sets')
for Lk in APR_ta_feng2.frequent_sets[-1:]:
    for x in Lk[:5]:
        print(x)

Creating single frequent sets: 100%|██████████████████████████████████████████| 31051/31051 [00:03<00:00, 10208.10it/s]


Found 142 frequent sets
({4710011402026}, 164, 1)
({4710088620156}, 283, 1)
({4712162000038}, 551, 1)
({4710085120703}, 189, 1)
({4710085120628}, 177, 1)
Wall time: 3.15 s


In [197]:
%%time
APR_ta_feng2.RUN()
print(f'After APRIORI {sum([len(Lk) for Lk in APR_ta_feng2.frequent_sets])} frequent sets')
for Lk in APR_ta_feng2.frequent_sets[-1:]:
    for x in Lk[:5]:
        print(x)

Creating Ck: 100%|██████████████████████████████████████████████████████████████| 31051/31051 [01:15<00:00, 412.24it/s]
Creating Ck: 100%|███████████████████████████████████████████████████████████| 26266/26266 [00:00<00:00, 392915.42it/s]


After APRIORI 151 frequent sets
({4710036003581, 4714981010038}, 267, 2)
({4711271000014, 4714981010038}, 369, 2)
({4710060000099, 4714981010038}, 255, 2)
({4711856000088, 4711856000125}, 296, 2)
({4711856020208, 4711856000088}, 198, 2)
Wall time: 1min 15s


In [210]:
pd.DataFrame.from_dict(APR_ta_feng2.create_assosiation_rules(set_size=ta_feng_df2.shape[0], confidence_threshold=0.4))

Unnamed: 0,confidence,leverage,lift,rule
0,0.424626,0.008447,3.457923,"((4711271000014,), ' -> ', (4714981010038,))"
1,0.435897,0.005899,3.549712,"((4710060000099,), ' -> ', (4714981010038,))"
2,0.4625,0.008937,15.992302,"((4711856000088,), ' -> ', (4711856000125,))"
3,0.426891,0.007626,14.761008,"((4711856020208,), ' -> ', (4711856000125,))"
4,0.464991,0.007822,16.078437,"((4711856020215,), ' -> ', (4711856000125,))"


# 4.3

In [199]:
%%time
support_count = 150
APR_ta_feng3 = APRIORI(ta_feng_df3.copy(), support_count)
print(f'Found {sum([len(Lk) for Lk in APR_ta_feng3.frequent_sets])} frequent sets')
for Lk in APR_ta_feng3.frequent_sets[-1:]:
    for x in Lk[:5]:
        print(x)

Creating single frequent sets: 100%|██████████████████████████████████████████| 31860/31860 [00:02<00:00, 11043.53it/s]


Found 163 frequent sets
({4711271000014}, 2359, 1)
({4710114105046}, 308, 1)
({4712425010712}, 500, 1)
({4710114606048}, 308, 1)
({4710085172702}, 150, 1)
Wall time: 3.01 s


In [200]:
%%time
APR_ta_feng3.RUN()
print(f'After APRIORI {sum([len(Lk) for Lk in APR_ta_feng3.frequent_sets])} frequent sets')
for Lk in APR_ta_feng3.frequent_sets[-1:]:
    for x in Lk[:5]:
        print(x)

Creating Ck: 100%|██████████████████████████████████████████████████████████████| 31860/31860 [01:40<00:00, 316.71it/s]
Creating Ck: 100%|██████████████████████████████████████████████████████████| 27319/27319 [00:00<00:00, 2490419.28it/s]


After APRIORI 167 frequent sets
({4711663700010, 4714981010038}, 230, 2)
({4710421090059, 4711271000014}, 284, 2)
({4710011401128, 4710011401135}, 160, 2)
({4710085120680, 4710085120697}, 209, 2)
Wall time: 1min 40s


In [208]:
pd.DataFrame.from_dict(APR_ta_feng3.create_assosiation_rules(set_size=ta_feng_df3.shape[0], confidence_threshold=0.6))

Unnamed: 0,confidence,leverage,lift,rule
0,0.727273,0.004936,58.218365,"((4710011401135,), ' -> ', (4710011401128,))"
1,0.76,0.006443,56.180046,"((4710085120697,), ' -> ', (4710085120680,))"


# 4.4

In [202]:
%%time
support_count = 150
APR_ta_feng4 = APRIORI(ta_feng_df4.copy(), support_count)
print(f'Found {sum([len(Lk) for Lk in APR_ta_feng4.frequent_sets])} frequent sets')
for Lk in APR_ta_feng4.frequent_sets[-1:]:
    for x in Lk[:5]:
        print(x)

Creating single frequent sets: 100%|██████████████████████████████████████████| 26766/26766 [00:02<00:00, 10910.90it/s]


Found 118 frequent sets
({4710088410139}, 419, 1)
({4710094699078}, 252, 1)
({4710114606048}, 298, 1)
({4710466103080}, 244, 1)
({4710043552102}, 163, 1)
Wall time: 2.56 s


In [204]:
%%time
APR_ta_feng4.RUN()
print(f'After APRIORI {sum([len(Lk) for Lk in APR_ta_feng4.frequent_sets])} frequent sets')
for Lk in APR_ta_feng4.frequent_sets[-1:]:
    for x in Lk[:5]:
        print(x)

Creating Ck: 100%|██████████████████████████████████████████████████████████████| 26766/26766 [00:44<00:00, 600.54it/s]
Creating Ck: 100%|██████████████████████████████████████████████████████████| 22659/22659 [00:00<00:00, 2445418.24it/s]


After APRIORI 120 frequent sets
({4710011401128, 4710011401135}, 150, 2)
({4711271000014, 4710683100015}, 283, 2)
Wall time: 44.6 s


In [214]:
pd.DataFrame.from_dict(APR_ta_feng4.create_assosiation_rules(set_size=ta_feng_df4.shape[0], confidence_threshold=0.1))

Unnamed: 0,confidence,leverage,lift,rule
0,0.735294,0.005501,54.51768,"((4710011401135,), ' -> ', (4710011401128,))"
1,0.415512,0.005501,54.51768,"((4710011401128,), ' -> ', (4710011401135,))"
2,0.111155,0.008398,4.861386,"((4711271000014,), ' -> ', (4710683100015,))"
3,0.462418,0.008398,4.861386,"((4710683100015,), ' -> ', (4711271000014,))"
