In [1]:
# Set up
%pylab
%pylab inline
%matplotlib inline
%load_ext autoreload
%autoreload 2
import tqdm
import random
import pandas as pd
from collections import Counter
from itertools import cycle, combinations
from operator import itemgetter

from sklearn import datasets, metrics, tree

import spectral
import seaborn as sns 
import tqdm
import copy 
import scipy

Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib
Populating the interactive namespace from numpy and matplotlib


In [2]:
class APRIORI:
    def __init__(self, transactions, support_count=100, delimiter=' '):
        self.records_no = len(transactions)
        self.support_count = support_count
        self.data = transactions
        self.columns = transactions.columns
        self.delimiter = delimiter
        
        self.items_count = self.get_items_count()
        
        self.frequent_sets = pd.DataFrame(
            {'items': self.items_count.index, 
             'support_count': self.items_count.values, 
             'set_size': 1}
        )
        
        self.data['set_size'] = self.data[self.columns[0]].str.count(delimiter) + 1
        self.data[self.columns[0]] = self.data[self.columns[0]].apply( \
                                     lambda row: set(map(str, row.split(delimiter))))
        
        
        # L1 = frequent item sets with 1 elements
        self.single_items_set = set(self.items_count.index)
    
    
    def get_items_count(self):
        single_items = self.data[self.columns[0]] \
                       .str.strip().str.split(self.delimiter, expand=True) \
                       .apply(pd.value_counts).sum(axis=1) \
                       .where(lambda value: value >= self.support_count).dropna()
        return single_items
    
    
    def gen_Ck(self, k):
        Lk = self.frequent_sets[self.frequent_sets['set_size'] == k - 1]['items']
        res = set()
        a = time.time()
#         print(f'Generating time: {a}')
        for i in range(len(Lk)):
            x = Lk.iloc[i]
            for j in range(i, len(Lk)):
                y = Lk.iloc[j]
                if type(x) is not tuple:
                    x = (x,)
                if type(y) is not tuple:
                    y = (y,)
                united = tuple(set(x + y))
                if len(united) == k:
                    res.add(tuple(sorted(united)))
#         print(f'Ended, time = {time.time() - a}')
        return list(res)
                
    
    
    def RUN(self):
        length = 0
        for length in tqdm.tqdm(range(2, len(self.single_items_set) + 1), \
                                desc=f'Expanding frequent sets', position=0, leave=True):
#             print('Start')
            self.data = self.data[self.data['set_size'] >= length]
#             print('Got data, start CK')
            Ck = self.gen_Ck(length)
#             print('Got Ck')
            d = self.data[self.columns[0]] \
                .apply(lambda st: pd.Series(s if set(s).issubset(st) else None \
                                            for s in Ck)) \
                .apply(lambda col: [col.dropna().unique()[0], col.count()] \
                       if col.count() >= self.support_count else None).dropna()
            if d.empty:
                break
#             print('All ready')
            self.frequent_sets = self.frequent_sets.append(pd.DataFrame(
                {'items': list(map(itemgetter(0), d.values)), 
                 'support_count': list(map(itemgetter(1), d.values)),
                 'set_size': length
                }), ignore_index=True)



# 0. Testing on wikipedia data <br/>
https://en.wikipedia.org/wiki/Apriori_algorithm

In [3]:
df = []
with open('Data/test.txt') as f:
    for x in f:
        df.append(x.split())
        
df = pd.DataFrame(df)
df

Unnamed: 0,0
0,1234
1,124
2,12
3,234
4,23
5,34
6,24


In [4]:
B = APRIORI(df.copy(), support_count=1, delimiter=',')

In [5]:
B.frequent_sets

Unnamed: 0,items,support_count,set_size
0,1,3.0,1
1,2,6.0,1
2,3,4.0,1
3,4,5.0,1


In [6]:
B.RUN()

Expanding frequent sets: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 167.37it/s]


In [7]:
B.frequent_sets

Unnamed: 0,items,support_count,set_size
0,1,3.0,1
1,2,6.0,1
2,3,4.0,1
3,4,5.0,1
4,"(1, 3)",1.0,2
5,"(1, 4)",2.0,2
6,"(1, 2)",3.0,2
7,"(2, 3)",3.0,2
8,"(2, 4)",4.0,2
9,"(3, 4)",3.0,2


# 1. Some store DF

In [8]:
 def process_data(df):
        '''
        Returns pair X, Y, where
        X : list of transactions,
            where transaction is a list of products
            
        Y: unique list of all products
        '''
        records = []
        all_products = set()
        records_no, attrs_no = df.shape
        
        for i in range(records_no):
            row = df.iloc[i][df.iloc[i].notna()]
            records.append([str(x) for x in row])
            all_products = all_products.union(set(row))
            
        return records, all_products
    
    
def map_items_to_numbers(transactions, all_products):
    number_to_string = {}
    string_to_number = {}
    for i, item in enumerate(all_products):
        string_to_number[item] = i
        number_to_string[i] = item

    for i in range(len(transactions)):
        transactions[i] = list(map(lambda x: string_to_number[x],
                                            transactions[i]))

    return transactions, string_to_number, number_to_string

In [9]:
store_df = pd.read_csv('Data/store_data.csv', header=None)
store_df.shape

(7501, 20)

In [10]:
store_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [11]:
# map all transaction to numbers
transactions, all_products = process_data(store_df)

transactions, string_to_number, number_to_string = \
map_items_to_numbers(transactions, all_products)

transactions = [' '.join(map(str, row)) for row in transactions]
transactions = pd.DataFrame(transactions)

In [12]:
transactions.columns = ['items']
transactions.head()

Unnamed: 0,items
0,117 105 86 1 48 63 4 89 71 83 61 3 30 76 111 5...
1,85 92 16
2,0
3,99 86
4,111 37 27 13 3


In [13]:
support_count = 100
APR = APRIORI(transactions.copy(), support_count)

In [14]:
print(f'Found {APR.frequent_sets.size} frequent sets')
APR.frequent_sets.head()

Found 198 frequent sets


Unnamed: 0,items,support_count,set_size
0,1,193.0,1
1,10,221.0,1
2,100,117.0,1
3,101,199.0,1
4,105,153.0,1


In [15]:
%%time
APR.RUN()
fs = APR.frequent_sets

Expanding frequent sets:   3%|█▊                                                        | 2/65 [00:12<07:36,  7.25s/it]

Wall time: 13.2 s


In [16]:
fs.tail()

Unnamed: 0,items,support_count,set_size
182,"(111, 24, 37)",105.0,3
183,"(108, 111, 112)",128.0,3
184,"(111, 112, 37)",118.0,3
185,"(111, 112, 24)",119.0,3
186,"(111, 112, 16)",107.0,3


In [21]:
def create_assosiation_rules(frequent_sets, confidence_threshold=0.4):
    support = {}
    assosiation_rules = []
        
    items = frequent_sets.columns[0]
    supp_cnt = frequent_sets.columns[1]
    
#     for x, supp in zip(frequent_sets[items], frequent_sets[supp_cnt]):
#         if type(x) is not tuple:
#             x = (int(x), )
#         else:
#             x = tuple(map(int, x))
#         support[x] = supp


    for x, supp in zip(frequent_sets[items], frequent_sets[supp_cnt]):
        S = []
        if type(x) is not tuple:
            x = (int(x), )
        else:
            x = tuple(map(int, x))
        support[tuple(sorted(x))] = supp
        S.append([(x, None)])
        
        for k in range(0, len(x) - 1):
            S.append([])
            for rule in S[k]:
                A = rule[0]
                B = rule[1]
                for i in range(len(A)):
                    tmp = B
                    if B is None:
                        tmp = (A[i], )
                    else: 
                        tmp += (A[i], )
                   
                    A = tuple(sorted(A))
                    tmp = tuple(sorted(tmp))
                    sup_AC = support[A]
                    sup_A = support[(A[:i] + A[i+1:])]
                    sup_C = support[tmp]
                    confidence = sup_AC / sup_A
                    lift = confidence / sup_C
                    leverage = sup_AC - sup_A * sup_C
                    
#                     print(sup_AC, sup_A, sup_C)
#                     print(A[:i] + A[i+1:], tmp, ' conf: ', confidence, lift, leverage)
                   
                    if confidence > confidence_threshold:
#                         print((A[:i] + A[i+1:], tmp))
                        S[k + 1].append((A[:i] + A[i+1:], tmp))
                        record = {'rule': str((A[:i] + A[i+1:], tmp)), 
                                 'confidence': confidence,
                                 'lift': lift,
                                 'leverage': leverage
                                 } 
                        assosiation_rules.append(record)
        
    assosiation_rules = pd.DataFrame.from_dict(assosiation_rules)
    return assosiation_rules
    

create_assosiation_rules(fs)

Unnamed: 0,confidence,leverage,lift,rule
0,0.456464,-677479.0,0.000255,"((51,), (111,))"
1,0.419028,-883065.0,0.000234,"((77,), (111,))"
2,0.401254,-570244.0,0.000224,"((58,), (111,))"
3,0.416554,-1317449.0,0.000233,"((108,), (111,))"
4,0.405622,-445111.0,0.000227,"((16, 24), (111,))"
5,0.435685,-430803.0,0.000244,"((24, 37), (111,))"
6,0.435374,-525544.0,0.000243,"((108, 112), (111,))"
7,0.416938,-400814.0,0.000319,"((108, 111), (112,))"
8,0.416554,-329869.0,0.00093,"((108,), (111, 112))"
9,0.443609,-475490.0,0.000248,"((37, 112), (111,))"


In [None]:
set_size = 3
big_sets = fs[fs['set_size'] >= set_size].copy()
big_sets['items'] = big_sets['items'].apply(lambda row: list(map(lambda x: number_to_string[int(x)], row)))
big_sets

# 2. Tom Brijs Retail Data Set

In [None]:
retail_df = pd.read_csv('Data/retail.dat', header=None, names=['transaction'])
retail_df = retail_df.drop([0])

In [None]:
print(retail_df.size)
retail_df.head()

In [None]:
support_count = 700
APR_retail = APRIORI(retail_df.copy(), support_count, delimiter=' ')

In [None]:
print(f'Found {APR_retail.frequent_sets.size} frequent sets')
APR_retail.frequent_sets.head()

In [None]:
%%time
APR_retail.RUN()
fs_retail = APR_retail.frequent_sets

In [None]:
set_size = 4
fs_retail[fs_retail['set_size'] >= set_size].copy()