In [1]:
# Set up
%pylab
%pylab inline
%matplotlib inline
%load_ext autoreload
%autoreload 2
import tqdm
import random
import pandas as pd
from collections import Counter
from itertools import cycle, combinations
from operator import itemgetter

from sklearn import datasets, metrics, tree

import spectral
import seaborn as sns 
import tqdm
import copy 
import scipy

Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib
Populating the interactive namespace from numpy and matplotlib


In [78]:
class APRIORI:
    def __init__(self, transactions, support_count=100, delimiter=' '):
        self.records_no = len(transactions)
        self.support_count = support_count
        self.data = transactions
        self.columns = transactions.columns
        self.delimiter = delimiter
        
        self.items_count = self.get_items_count()
        
        self.frequent_sets = pd.DataFrame(
            {'items': self.items_count.index, 
             'support_count': self.items_count.values, 
             'set_size': 1}
        )
        
        self.data['set_size'] = self.data[self.columns[0]].str.count(delimiter) + 1
        self.data[self.columns[0]] = self.data[self.columns[0]].apply( \
                                     lambda row: set(map(str, row.split(delimiter))))
        
        
        # L1 = frequent item sets with 1 elements
        self.single_items_set = set(self.items_count.index)
    
    
    def get_items_count(self):
        single_items = self.data[self.columns[0]] \
                       .str.strip().str.split(self.delimiter, expand=True) \
                       .apply(pd.value_counts).sum(axis=1) \
                       .where(lambda value: value >= self.support_count).dropna()
        return single_items
    
    
    def gen_Ck(self, k):
        Lk = self.frequent_sets[self.frequent_sets['set_size'] == k - 1]['items']
        res = set()
        a = time.time()
#         print(f'Generating time: {a}')
        for i in range(len(Lk)):
            x = Lk.iloc[i]
            for j in range(i, len(Lk)):
                y = Lk.iloc[j]
                if type(x) is not tuple:
                    x = (x,)
                if type(y) is not tuple:
                    y = (y,)
                united = tuple(set(x + y))
                if len(united) == k:
                    res.add(tuple(sorted(united)))
#         print(f'Ended, time = {time.time() - a}')
        return list(res)
                
    
    
    def RUN(self):
        length = 0
        for length in tqdm.tqdm(range(2, len(self.single_items_set) + 1), \
                                desc=f'Expanding frequent sets', position=0, leave=True):
#             print('Start')
            self.data = self.data[self.data['set_size'] >= length]
#             print('Got data, start CK')
            Ck = self.gen_Ck(length)
#             print('Got Ck')
            d = self.data[self.columns[0]] \
                .apply(lambda st: pd.Series(s if set(s).issubset(st) else None \
                                            for s in Ck)) \
                .apply(lambda col: [col.dropna().unique()[0], col.count()] \
                       if col.count() >= self.support_count else None).dropna()
            if d.empty:
                break
#             print('All ready')
            self.frequent_sets = self.frequent_sets.append(pd.DataFrame(
                {'items': list(map(itemgetter(0), d.values)), 
                 'support_count': list(map(itemgetter(1), d.values)),
                 'set_size': length
                }), ignore_index=True)



# 0. Testing on wikipedia data <br/>
https://en.wikipedia.org/wiki/Apriori_algorithm

In [3]:
df = []
with open('Data/test.txt') as f:
    for x in f:
        df.append(x.split())
        
df = pd.DataFrame(df)
df

Unnamed: 0,0
0,1234
1,124
2,12
3,234
4,23
5,34
6,24


In [4]:
B = APRIORI(df.copy(), support_count=1, delimiter=',')

In [5]:
B.frequent_sets

Unnamed: 0,items,support_count,set_size
0,1,3.0,1
1,2,6.0,1
2,3,4.0,1
3,4,5.0,1


In [6]:
B.RUN()

Expanding frequent sets: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 167.11it/s]


In [7]:
B.frequent_sets

Unnamed: 0,items,support_count,set_size
0,1,3.0,1
1,2,6.0,1
2,3,4.0,1
3,4,5.0,1
4,"(2, 4)",4.0,2
5,"(1, 3)",1.0,2
6,"(2, 3)",3.0,2
7,"(3, 4)",3.0,2
8,"(1, 2)",3.0,2
9,"(1, 4)",2.0,2


# 1. Some store DF

In [8]:
 def process_data(df):
        '''
        Returns pair X, Y, where
        X : list of transactions,
            where transaction is a list of products
            
        Y: unique list of all products
        '''
        records = []
        all_products = set()
        records_no, attrs_no = df.shape
        
        for i in range(records_no):
            row = df.iloc[i][df.iloc[i].notna()]
            records.append([str(x) for x in row])
            all_products = all_products.union(set(row))
            
        return records, all_products
    
    
def map_items_to_numbers(transactions, all_products):
    number_to_string = {}
    string_to_number = {}
    for i, item in enumerate(all_products):
        string_to_number[item] = i
        number_to_string[i] = item

    for i in range(len(transactions)):
        transactions[i] = list(map(lambda x: string_to_number[x],
                                            transactions[i]))

    return transactions, string_to_number, number_to_string

In [9]:
store_df = pd.read_csv('Data/store_data.csv', header=None)
store_df.shape

(7501, 20)

In [10]:
store_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [11]:
# map all transaction to numbers
transactions, all_products = process_data(store_df)

transactions, string_to_number, number_to_string = \
map_items_to_numbers(transactions, all_products)

transactions = [' '.join(map(str, row)) for row in transactions]
transactions = pd.DataFrame(transactions)

In [67]:
transactions.columns = ['items']
transactions.head()

Unnamed: 0,items
0,89 80 30 101 61 52 6 50 85 10 13 57 18 5 107 2...
1,118 40 39
2,83
3,106 30
4,107 12 109 59 57


In [68]:
support_count = 100
APR = APRIORI(transactions.copy(), support_count)

In [69]:
print(f'Found {APR.frequent_sets.size} frequent sets')
APR.frequent_sets.head()

Found 198 frequent sets


Unnamed: 0,items,support_count,set_size
0,10,228.0,1
1,100,351.0,1
2,101,193.0,1
3,102,117.0,1
4,103,450.0,1


In [26]:
%%time
APR.RUN()
fs = APR.frequent_sets

Expanding frequent sets:   3%|█▊                                                        | 2/65 [00:12<07:37,  7.27s/it]

Wall time: 13.2 s


In [27]:
set_size = 3
big_sets = fs[fs['set_size'] >= set_size].copy()
big_sets['items'] = big_sets['items'].apply(lambda row: list(map(lambda x: number_to_string[int(x)], row)))
big_sets

Unnamed: 0,items,support_count,set_size
181,"[mineral water, spaghetti, ground beef]",128.0,3
182,"[mineral water, milk, chocolate]",105.0,3
183,"[mineral water, spaghetti, eggs]",107.0,3
184,"[mineral water, spaghetti, chocolate]",119.0,3
185,"[mineral water, eggs, chocolate]",101.0,3
186,"[mineral water, spaghetti, milk]",118.0,3


# 2. Tom Brijs Retail Data Set

In [70]:
retail_df = pd.read_csv('Data/retail.dat', header=None, names=['transaction'])
retail_df = retail_df.drop([0])

In [85]:
print(retail_df.size)
retail_df.head()

88161


Unnamed: 0,transaction
1,30 31 32
2,33 34 35
3,36 37 38 39 40 41 42 43 44 45 46
4,38 39 47 48
5,38 39 48 49 50 51 52 53 54 55 56 57 58


In [81]:
support_count = 1000
APR = APRIORI(retail_df.copy(), support_count, delimiter=' ')

In [82]:
print(f'Found {APR.frequent_sets.size} frequent sets')
APR.frequent_sets.head()

Found 168 frequent sets


Unnamed: 0,items,support_count,set_size
0,1004,1102.0,1
1,101,2237.0,1
2,110,2794.0,1
3,1146,1426.0,1
4,117,1026.0,1


In [84]:
%%time
APR.RUN()
fs = APR.frequent_sets

Expanding frequent sets:   5%|███▏                                                      | 3/55 [02:26<50:25, 58.17s/it]

Wall time: 2min 39s
