In [2]:
# Set up
%pylab
%pylab inline
%matplotlib inline
%load_ext autoreload
%autoreload 2
import tqdm
import random
import pandas as pd
from collections import Counter
from itertools import cycle, combinations
from operator import itemgetter

from sklearn import datasets, metrics, tree

import spectral
import seaborn as sns 
import tqdm
import copy 
import scipy

Using matplotlib backend: MacOSX
Populating the interactive namespace from numpy and matplotlib
Populating the interactive namespace from numpy and matplotlib


In [3]:
class APRIORI:
    def __init__(self, transactions, support_count=100, delimiter=' '):
        self.records_no = len(transactions)
        self.support_count = support_count
        self.data = transactions
        self.columns = transactions.columns
        self.delimiter = delimiter
        
        self.items_count = self.get_items_count()
        
        self.frequent_sets = pd.DataFrame(
            {'items': self.items_count.index, 
             'support_count': self.items_count.values, 
             'set_size': 1}
        )
        
        self.data['set_size'] = self.data[self.columns[0]].str.count(delimiter) + 1
        self.data[self.columns[0]] = self.data[self.columns[0]].apply( \
                                     lambda row: set(map(str, row.split(delimiter))))
        
        
        # L1 = frequent item sets with 1 elements
        self.single_items_set = set(self.items_count.index)
    
    
    def get_items_count(self):
        single_items = self.data[self.columns[0]] \
                       .str.strip().str.split(self.delimiter, expand=True) \
                       .apply(pd.value_counts).sum(axis=1) \
                       .where(lambda value: value >= self.support_count).dropna()
        return single_items
    
    
    def gen_Ck(self, k):
        Lk = self.frequent_sets[self.frequent_sets['set_size'] == k - 1]['items']
        res = set()
        a = time.time()
#         print(f'Generating time: {a}')
        for i in range(len(Lk)):
            x = Lk.iloc[i]
            for j in range(i, len(Lk)):
                y = Lk.iloc[j]
                if type(x) is not tuple:
                    x = (x,)
                if type(y) is not tuple:
                    y = (y,)
                united = tuple(set(x + y))
                if len(united) == k:
                    res.add(tuple(sorted(united)))
#         print(f'Ended, time = {time.time() - a}')
        return list(res)
                
    
    
    def RUN(self):
        length = 0
        for length in tqdm.tqdm(range(2, len(self.single_items_set) + 1), \
                                desc=f'Expanding frequent sets', position=0, leave=True):

            self.data = self.data[self.data['set_size'] >= length]
            Ck = self.gen_Ck(length)

            d = self.data[self.columns[0]] \
                .apply(lambda st: pd.Series(s if set(s).issubset(st) else None \
                                            for s in Ck)) \
                .apply(lambda col: [col.dropna().unique()[0], col.count()] \
                       if col.count() >= self.support_count else None).dropna()
            if d.empty:
                break
                
            self.frequent_sets = self.frequent_sets.append(pd.DataFrame(
                {'items': list(map(itemgetter(0), d.values)), 
                 'support_count': list(map(itemgetter(1), d.values)),
                 'set_size': length
                }), ignore_index=True)
            
            
    def create_assosiation_rules(self, set_size, confidence_threshold=0.4):
        support = {}
        assosiation_rules = []
        
        items = self.frequent_sets.columns[0]
        supp_cnt = self.frequent_sets.columns[1]

        for x, supp in zip(self.frequent_sets[items], self.frequent_sets[supp_cnt]):
            S = []
            if type(x) is not tuple:
                x = (int(x), )
            else:
                x = tuple(map(int, x))
            support[tuple(sorted(x))] = supp
            S.append([(x, None)])

            for k in range(0, len(x) - 1):
                S.append([])
                for rule in S[k]:
                    A = rule[0]
                    B = rule[1]
                    for i in range(len(A)):
                        tmp = B
                        if B is None:
                            tmp = (A[i], )
                        else: 
                            tmp += (A[i], )

                        A = tuple(sorted(set(A)))
                        tmp = tuple(sorted(set(tmp)))
                        sup_AC = support[A] / set_size
                        sup_A = support[(A[:i] + A[i+1:])] / set_size
                        sup_C = support[tmp] / set_size
                        confidence = sup_AC / sup_A
                        lift = confidence / sup_C
                        leverage = sup_AC - sup_A * sup_C

                        if confidence > confidence_threshold:
                            S[k + 1].append((A[:i] + A[i+1:], tmp))
                            record = {'rule': str((A[:i] + A[i+1:], tmp)), 
                                     'confidence': confidence,
                                     'lift': lift,
                                     'leverage': leverage
                                     } 
                            assosiation_rules.append(record)

        assosiation_rules = pd.DataFrame.from_dict(assosiation_rules)
        return assosiation_rules

# 0. Testing on wikipedia data <br/>
https://en.wikipedia.org/wiki/Apriori_algorithm

In [3]:
df = []
with open('Data/test.txt') as f:
    for x in f:
        df.append(x.split())
        
df = pd.DataFrame(df)
df

Unnamed: 0,0
0,1234
1,124
2,12
3,234
4,23
5,34
6,24


In [4]:
B = APRIORI(df.copy(), support_count=3, delimiter=',')
B.RUN()

Expanding frequent sets:   0%|          | 0/3 [00:00<?, ?it/s]


In [5]:
B.frequent_sets

Unnamed: 0,items,support_count,set_size
0,1,3.0,1
1,2,6.0,1
2,3,4.0,1
3,4,5.0,1
4,"(3, 4)",3.0,2
5,"(1, 2)",3.0,2
6,"(2, 4)",4.0,2
7,"(2, 3)",3.0,2


In [6]:
B.create_assosiation_rules(set_size=df.shape[0], confidence_threshold=0.7)

Unnamed: 0,rule,confidence,lift,leverage
0,"((3,), (4,))",0.75,1.05,0.020408
1,"((1,), (2,))",1.0,1.166667,0.061224
2,"((4,), (2,))",0.8,0.933333,-0.040816
3,"((3,), (2,))",0.75,0.875,-0.061224


# 1. Some store DF

In [7]:
 def process_data(df):
        '''
        Returns pair X, Y, where
        X : list of transactions,
            where transaction is a list of products
            
        Y: unique list of all products
        '''
        records = []
        all_products = set()
        records_no, attrs_no = df.shape
        
        for i in range(records_no):
            row = df.iloc[i][df.iloc[i].notna()]
            records.append([str(x) for x in row])
            all_products = all_products.union(set(row))
            
        return records, all_products
    
    
def map_items_to_numbers(transactions, all_products):
    number_to_string = {}
    string_to_number = {}
    for i, item in enumerate(all_products):
        string_to_number[item] = i
        number_to_string[i] = item

    for i in range(len(transactions)):
        transactions[i] = list(map(lambda x: string_to_number[x],
                                            transactions[i]))

    return transactions, string_to_number, number_to_string

In [8]:
store_df = pd.read_csv('Data/store_data.csv', header=None)
store_df.shape

(7501, 20)

In [9]:
store_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [10]:
# map all transaction to numbers
transactions, all_products = process_data(store_df)

transactions, string_to_number, number_to_string = \
map_items_to_numbers(transactions, all_products)

transactions = [' '.join(map(str, row)) for row in transactions]
transactions = pd.DataFrame(transactions)

In [11]:
transactions.columns = ['items']
transactions.head()

Unnamed: 0,items
0,117 118 75 38 63 110 102 15 21 17 74 3 101 44 ...
1,52 73 106
2,45
3,48 75
4,97 113 28 95 3


In [12]:
support_count = 100
APR = APRIORI(transactions.copy(), support_count)

In [13]:
print(f'Found {APR.frequent_sets.size} frequent sets')
APR.frequent_sets.head()

Found 198 frequent sets


Unnamed: 0,items,support_count,set_size
0,0,737.0,1
1,10,160.0,1
2,100,149.0,1
3,101,356.0,1
4,103,211.0,1


In [14]:
%%time
APR.RUN()
fs = APR.frequent_sets

Expanding frequent sets:   3%|▎         | 2/65 [00:19<11:21, 10.81s/it]

CPU times: user 18.8 s, sys: 535 ms, total: 19.4 s
Wall time: 20.4 s


In [15]:
fs.tail()

Unnamed: 0,items,support_count,set_size
182,"(106, 29, 97)",101.0,3
183,"(16, 29, 97)",119.0,3
184,"(113, 29, 97)",105.0,3
185,"(106, 16, 97)",107.0,3
186,"(0, 16, 97)",128.0,3


In [17]:
APR.create_assosiation_rules(set_size=transactions.shape[0], confidence_threshold=0.4)

Unnamed: 0,rule,confidence,lift,leverage
0,"((50,), (97,))",0.419028,1.757904,0.011898
1,"((0,), (97,))",0.416554,1.747522,0.017507
2,"((7,), (97,))",0.401254,1.683336,0.006927
3,"((31,), (97,))",0.456464,1.914955,0.01102
4,"((16, 113), (97,))",0.443609,1.861024,0.007278
5,"((29, 106), (97,))",0.405622,1.701663,0.005552
6,"((16, 29), (97,))",0.404762,1.698053,0.006522
7,"((29, 113), (97,))",0.435685,1.82778,0.00634
8,"((0, 97), (16,))",0.416938,2.394681,0.009938
9,"((0, 16), (97,))",0.435374,1.826477,0.007722


In [18]:
set_size = 3
big_sets = fs[fs['set_size'] >= set_size].copy()
big_sets['items'] = big_sets['items'].apply(lambda row: list(map(lambda x: number_to_string[int(x)], row)))
big_sets

Unnamed: 0,items,support_count,set_size
181,"[milk, spaghetti, mineral water]",118.0,3
182,"[eggs, chocolate, mineral water]",101.0,3
183,"[spaghetti, chocolate, mineral water]",119.0,3
184,"[milk, chocolate, mineral water]",105.0,3
185,"[eggs, spaghetti, mineral water]",107.0,3
186,"[ground beef, spaghetti, mineral water]",128.0,3


# 2. Tom Brijs Retail Data Set

In [19]:
retail_df = pd.read_csv('Data/retail.dat', header=None, names=['transaction'])
retail_df = retail_df.drop([0])

In [20]:
print(retail_df.size)
retail_df.head()

88161


Unnamed: 0,transaction
1,30 31 32
2,33 34 35
3,36 37 38 39 40 41 42 43 44 45 46
4,38 39 47 48
5,38 39 48 49 50 51 52 53 54 55 56 57 58


In [38]:
support_count = 1500
APR_retail = APRIORI(retail_df.copy(), support_count, delimiter=' ')

In [39]:
print(f'Found {APR_retail.frequent_sets.size} frequent sets')
APR_retail.frequent_sets.head()

Found 69 frequent sets


Unnamed: 0,items,support_count,set_size
0,101,2237.0,1
1,110,2794.0,1
2,1327,1786.0,1
3,147,1779.0,1
4,170,3099.0,1


In [40]:
%%time
APR_retail.RUN()
fs_retail = APR_retail.frequent_sets

Expanding frequent sets:  14%|█▎        | 3/22 [01:49<12:30, 39.52s/it]

CPU times: user 2min 4s, sys: 2.37 s, total: 2min 7s
Wall time: 2min 9s


In [41]:
set_size = 3
fs_retail[fs_retail['set_size'] >= set_size].copy()

Unnamed: 0,items,support_count,set_size
51,"(32, 38, 48)",1646.0,3
52,"(38, 41, 48)",2374.0,3
53,"(32, 39, 41)",2359.0,3
54,"(38, 39, 48)",6102.0,3
55,"(39, 48, 89)",2125.0,3
56,"(38, 39, 41)",3051.0,3
57,"(32, 41, 48)",2063.0,3
58,"(32, 39, 48)",5402.0,3
59,"(36, 38, 39)",1945.0,3
60,"(170, 38, 39)",2019.0,3


In [42]:
APR_retail.create_assosiation_rules(set_size=retail_df.shape[0], confidence_threshold=0.8)

Unnamed: 0,rule,confidence,lift,leverage
0,"((110,), (110,))",0.975304,30.774444,0.029905
1,"((170,), (170,))",0.978057,27.823983,0.033145
2,"((36,), (38,))",0.950272,5.371696,0.025755
3,"((36, 39), (38,))",0.954836,5.39749,0.017974
4,"((39, 170), (170,))",0.980573,27.895548,0.02208
5,"((39, 110), (110,))",0.989198,31.212856,0.019104
6,"((41, 48), (39,))",0.816811,1.421033,0.024755
7,"((48, 170), (170,))",0.987797,28.101057,0.016825
8,"((38, 41, 48), (39,))",0.838669,1.45906,0.007105


# 3. Korsarak

In [12]:
%%time
kosarak_df = pd.read_csv('Data/kosarak.dat', header=None)

CPU times: user 704 ms, sys: 74.8 ms, total: 779 ms
Wall time: 786 ms


In [13]:
kosarak_df.head()

Unnamed: 0,0
0,1 2 3
1,1
2,4 5 6 7
3,1 8
4,9 10


In [17]:
%%time
support_count = 100000
APR_kosarak = APRIORI(kosarak_df.copy(), support_count)

KeyboardInterrupt: 

In [None]:
print(f'Found {APR_kosarak.frequent_sets.size} frequent sets')
APR_kosarak.frequent_sets.head()