In [1]:
import pandas as pd
import numpy as np
import uuid
from collections import Counter
import math
from progressbar import ProgressBar, AdaptiveETA, Percentage, Bar, RotatingMarker

In [2]:
# Load the recipe table from HDF5. Later cells treat every column except the
# last 8 as a per-ingredient value (see the `[:, :-8]` slice that follows).
df = pd.read_hdf('../data/processed/recipe_vectors.h5')

In [3]:
# Raw value matrix with the last 8 columns removed.
# NOTE(review): presumably the trailing columns are non-ingredient metadata — confirm.
dat = df.values[:,:-8]

In [4]:
# Flatten the recipe matrix into (recipe id, ingredient name) pairs, one pair
# per non-zero cell, so each recipe can later be treated as a basket.
columns = list(df)
new_vec = []
for recipe in dat:
    # One random hex id per recipe ties its ingredient rows together.
    ind = uuid.uuid4().hex
    new_vec.extend(
        [ind, columns[pos]] for pos, amount in enumerate(recipe) if amount != 0
    )

In [5]:
print(len(dat))
# Index the (id, name) pairs by recipe id; set_index drops the "index" column
# by default, so only "name" remains as a regular column.
basketized = pd.DataFrame(new_vec, columns=["index", "name"]).set_index("index")
basketized.head()

48817


Unnamed: 0_level_0,name
index,Unnamed: 1_level_1
364f94a3abda49d5aa4e308bcc8e0045,garlic
364f94a3abda49d5aa4e308bcc8e0045,mayonnaise
364f94a3abda49d5aa4e308bcc8e0045,pimiento
364f94a3abda49d5aa4e308bcc8e0045,sharp Cheddar
fbfb40ee3ed64158a256c466d342e6c2,almond


In [6]:
def generate_candidate_set(size, L):
    """Join frequent itemsets pairwise into candidate itemsets one item larger.

    Two itemsets are joined when the earlier one has exactly one item that the
    later one lacks; the candidate is the union of the pair.

    Parameters
    ----------
    size : int
        Size of the candidates being generated. Kept for interface
        compatibility; the join itself does not read it.
    L : iterable of set or frozenset
        Frequent itemsets from the previous level.

    Returns
    -------
    set of frozenset
        Candidate itemsets (empty when no pair can be joined).
    """
    # Normalise to frozenset so plain ``set`` inputs work as well:
    # ``frozenset.union(a, b)`` raises TypeError when ``a`` is a mutable set,
    # and the candidates must be hashable to be stored in the result set.
    itemsets = [frozenset(itemset) for itemset in L]
    candidate_set = set()
    for i, left in enumerate(itemsets):
        for right in itemsets[i + 1:]:
            if len(left - right) == 1:
                candidate_set.add(left | right)
    return candidate_set

def apriori(frame, support_per):
    """Mine frequent itemsets from a basket-style frame with the Apriori algorithm.

    Parameters
    ----------
    frame : pandas.DataFrame
        One row per (basket, item) pair. Baskets are the groups of index
        level 0; the item is read from the first column for the 1-itemset
        pass and from the ``"name"`` column for larger itemsets.
    support_per : float
        Minimum support as a fraction of the number of baskets. An itemset is
        kept when its basket count is strictly greater than
        ``floor(n_baskets * support_per)``.

    Returns
    -------
    (candidate_sets, frequent_sets) : tuple of list
        ``candidate_sets[k]`` is the set of candidates of size ``k + 2`` and
        ``frequent_sets[k]`` the list of those that passed the threshold.
        Frequent 1-itemsets are printed but not returned.
    """
    candidate_sets = []
    frequent_sets = []
    # Group once and reuse; the grouping does not change between passes.
    baskets = frame.groupby(level=0)
    thresh = math.floor(len(baskets)*support_per)

    C1 = Counter()
    for ind, basket in baskets:
        for item in basket.values:
            C1[item[0]] += 1

    # Itemsets must be frozensets: they are used as Counter keys and joined
    # with frozenset operations when candidates are generated.
    L1 = [frozenset([name]) for name, n in C1.most_common() if n > thresh]
    print(L1)

    candidate = generate_candidate_set(2, L1)
    size = 2
    while len(candidate) != 0:
        widgets = ['Counting C{}: '.format(size), Percentage(), ' ', Bar(marker=RotatingMarker()), ' ', AdaptiveETA()]
        pbar = ProgressBar(widgets=widgets)
        count = Counter()
        for ind, basket in pbar(baskets):
            # A subset test is immune to duplicate item rows in a basket,
            # unlike comparing a row count against the candidate size.
            items = set(basket["name"])
            for c in candidate:
                if c.issubset(items):
                    count[c] += 1
        # Keep each frequent itemset as-is; wrapping it in another set would
        # produce a set-of-frozenset and break the next candidate join.
        frequent = [itemset for itemset, n in count.most_common() if n > thresh]
        candidate_sets.append(candidate)
        frequent_sets.append(frequent)
        size += 1
        candidate = generate_candidate_set(size, frequent)

    return candidate_sets, frequent_sets

def apriori_quick(frame, vec_frame, support_per, n_meta_cols=8):
    """Apriori over a recipe-by-ingredient matrix, counting support with numpy.

    Parameters
    ----------
    frame : pandas.DataFrame
        One row per (basket, item) pair, baskets being the groups of index
        level 0. Used for the basket count and the 1-itemset pass (item read
        from the first column).
    vec_frame : pandas.DataFrame
        One row per recipe and one column per ingredient; a non-zero cell
        means the ingredient is present. Item names in ``frame`` must match
        these column names.
    support_per : float
        Minimum support as a fraction of the number of baskets. An itemset is
        kept when its count is strictly greater than
        ``floor(n_baskets * support_per)``.
    n_meta_cols : int, optional
        Number of trailing columns of ``vec_frame`` that are not ingredients
        and are excluded from the name-to-column map (default 8).

    Returns
    -------
    (candidate_sets, frequent_sets) : tuple of list
        ``candidate_sets[k]`` is the set of candidates of size ``k + 2`` and
        ``frequent_sets[k]`` the list of those that passed the threshold.
        Frequent 1-itemsets are printed but not returned.
    """
    candidate_sets = []
    frequent_sets = []
    baskets = frame.groupby(level=0)
    thresh = math.floor(len(baskets)*support_per)
    dat = vec_frame.values
    columns = list(vec_frame)
    # Map each ingredient name to its column position in ``dat``.
    ing_map = {columns[i]: i for i in range(len(columns) - n_meta_cols)}

    C1 = Counter()
    for ind, basket in baskets:
        for item in basket.values:
            C1[item[0]] += 1

    L1 = [frozenset([name]) for name, n in C1.most_common() if n > thresh]
    print(L1)

    candidate = generate_candidate_set(2, L1)
    size = 2
    while len(candidate) != 0:
        widgets = ['Counting C{}: '.format(size), Percentage(), ' ', Bar(marker=RotatingMarker()), ' ', AdaptiveETA()]
        pbar = ProgressBar(widgets=widgets)
        count = Counter()
        for c in pbar(candidate):
            check_vec = [ing_map[ing] for ing in c]
            # Count, in one vectorized pass, the recipes in which every
            # ingredient of the candidate is non-zero.
            all_present = np.count_nonzero(dat[:, check_vec], axis=1) == len(check_vec)
            support = int(np.count_nonzero(all_present))
            if support:
                count[c] = support
        frequent = [itemset for itemset, n in count.most_common() if n > thresh]
        candidate_sets.append(candidate)
        frequent_sets.append(frequent)
        size += 1
        candidate = generate_candidate_set(size, frequent)

    return candidate_sets, frequent_sets

In [7]:
# Mine frequent ingredient sets at 1% minimum support. Slow: the progress
# output recorded for this run adds up to roughly 30 minutes.
c_sets, f_sets = apriori_quick(basketized, df, 0.01)

Counting C2:   0% |                                            | ETA:  --:--:--

[frozenset({'sugar'}), frozenset({'garlic'}), frozenset({'egg'}), frozenset({'onion'}), frozenset({'salt'}), frozenset({'unsalted butter'}), frozenset({'olive oil'}), frozenset({'water'}), frozenset({'all purpose flour'}), frozenset({'lemon juice'}), frozenset({'extra virgin olive oil'}), frozenset({'kosher salt'}), frozenset({'black pepper'}), frozenset({'parsley'}), frozenset({'vegetable oil'}), frozenset({'milk'}), frozenset({'butter'}), frozenset({'vanilla extract'}), frozenset({'tomato'}), frozenset({'thyme'}), frozenset({'ginger'}), frozenset({'cinnamon'}), frozenset({'shallot'}), frozenset({'baking powder'}), frozenset({'heavy cream'}), frozenset({'carrot'}), frozenset({'chicken broth'}), frozenset({'lime juice'}), frozenset({'scallion'}), frozenset({'basil'}), frozenset({'baking soda'}), frozenset({'soy sauce'}), frozenset({'cilantro'}), frozenset({'honey'}), frozenset({'cumin'}), frozenset({'bay'}), frozenset({'dry white wine'}), frozenset({'Dijon mustard'}), frozenset({'orega

Counting C2: 100% ||||||||||||||||||||||||||||||||||||||||||||||| Time: 0:18:21
Counting C3: 100% ||||||||||||||||||||||||||||||||||||||||||||||| Time: 0:08:49
Counting C4: 100% ||||||||||||||||||||||||||||||||||||||||||||||| Time: 0:02:17
Counting C5: 100% ||||||||||||||||||||||||||||||||||||||||||||||| Time: 0:00:30
Counting C6: 100% ||||||||||||||||||||||||||||||||||||||||||||||| Time: 0:00:07
Counting C7: 100% ||||||||||||||||||||||||||||||||||||||||||||||| Time: 0:00:00


In [8]:
# Dump the frequent itemsets, one list per itemset size (pairs first).
for f in f_sets:
    print(f)

[frozenset({'egg', 'sugar'}), frozenset({'onion', 'garlic'}), frozenset({'egg', 'unsalted butter'}), frozenset({'salt', 'egg'}), frozenset({'unsalted butter', 'sugar'}), frozenset({'all purpose flour', 'egg'}), frozenset({'salt', 'sugar'}), frozenset({'all purpose flour', 'unsalted butter'}), frozenset({'olive oil', 'garlic'}), frozenset({'salt', 'unsalted butter'}), frozenset({'all purpose flour', 'salt'}), frozenset({'all purpose flour', 'sugar'}), frozenset({'olive oil', 'onion'}), frozenset({'water', 'sugar'}), frozenset({'water', 'salt'}), frozenset({'vanilla extract', 'sugar'}), frozenset({'egg', 'vanilla extract'}), frozenset({'extra virgin olive oil', 'garlic'}), frozenset({'milk', 'egg'}), frozenset({'baking powder', 'egg'}), frozenset({'salt', 'garlic'}), frozenset({'black pepper', 'salt'}), frozenset({'vanilla extract', 'unsalted butter'}), frozenset({'salt', 'onion'}), frozenset({'parsley', 'garlic'}), frozenset({'all purpose flour', 'baking powder'}), frozenset({'tomato', 

In [9]:
# Turn each level of frequent itemsets into a DataFrame with one row per
# itemset and its items sorted alphabetically across "Item 1..k" columns.
dfs = []
# f_sets[0] holds itemsets of size 2, f_sets[1] size 3, and so on.
for level_size, f in enumerate(f_sets, start=2):
    freq_vec = [sorted(s) for s in f]
    # Derive the column count from this level's own data. A level with no
    # frequent itemsets (normally the last one) falls back to its nominal
    # size rather than reusing a stale variable from the previous level.
    width = len(freq_vec[0]) if freq_vec else level_size
    new_df = pd.DataFrame(freq_vec, columns=["Item {}".format(x) for x in range(1, width + 1)])
    dfs.append(new_df)

In [10]:
# Preview the first ten frequent 2-itemsets.
dfs[0].head(10)

Unnamed: 0,Item 1,Item 2
0,egg,sugar
1,garlic,onion
2,egg,unsalted butter
3,egg,salt
4,sugar,unsalted butter
5,all purpose flour,egg
6,salt,sugar
7,all purpose flour,unsalted butter
8,garlic,olive oil
9,salt,unsalted butter


In [11]:
# Preview the first ten frequent 3-itemsets.
dfs[1].head(10)

Unnamed: 0,Item 1,Item 2,Item 3
0,egg,sugar,unsalted butter
1,all purpose flour,egg,unsalted butter
2,all purpose flour,egg,sugar
3,egg,salt,sugar
4,all purpose flour,sugar,unsalted butter
5,all purpose flour,egg,salt
6,egg,salt,unsalted butter
7,all purpose flour,salt,unsalted butter
8,all purpose flour,salt,sugar
9,salt,sugar,unsalted butter


In [12]:
# Preview the first ten frequent 4-itemsets.
dfs[2].head(10)

Unnamed: 0,Item 1,Item 2,Item 3,Item 4
0,all purpose flour,egg,sugar,unsalted butter
1,all purpose flour,egg,salt,unsalted butter
2,all purpose flour,egg,salt,sugar
3,egg,salt,sugar,unsalted butter
4,all purpose flour,salt,sugar,unsalted butter
5,egg,sugar,unsalted butter,vanilla extract
6,all purpose flour,baking powder,egg,sugar
7,all purpose flour,baking powder,egg,unsalted butter
8,all purpose flour,egg,unsalted butter,vanilla extract
9,all purpose flour,egg,sugar,vanilla extract


In [13]:
# Preview the first ten frequent 5-itemsets.
dfs[3].head(10)

Unnamed: 0,Item 1,Item 2,Item 3,Item 4,Item 5
0,all purpose flour,egg,salt,sugar,unsalted butter
1,all purpose flour,egg,sugar,unsalted butter,vanilla extract
2,all purpose flour,baking powder,egg,sugar,unsalted butter
3,all purpose flour,baking powder,egg,salt,sugar
4,all purpose flour,baking powder,egg,salt,unsalted butter
5,egg,salt,sugar,unsalted butter,vanilla extract
6,baking powder,egg,salt,sugar,unsalted butter
7,all purpose flour,egg,salt,unsalted butter,vanilla extract
8,all purpose flour,egg,salt,sugar,vanilla extract
9,all purpose flour,baking powder,salt,sugar,unsalted butter


In [14]:
# Preview the frequent 6-itemsets (up to ten rows).
dfs[4].head(10)

Unnamed: 0,Item 1,Item 2,Item 3,Item 4,Item 5,Item 6
0,all purpose flour,baking powder,egg,salt,sugar,unsalted butter
1,all purpose flour,egg,salt,sugar,unsalted butter,vanilla extract
2,all purpose flour,baking powder,egg,sugar,unsalted butter,vanilla extract
3,all purpose flour,baking soda,egg,salt,sugar,unsalted butter


In [27]:
def conf_calc(df_in, I, j):
    """Return the confidence of the rule ``I -> j``.

    Confidence is ``support(I and j) / support(I)``, where an item counts as
    present in a row when its cell in ``df_in`` is non-zero.

    Parameters
    ----------
    df_in : pandas.DataFrame
        One row per recipe with one column per item.
    I : array-like of column labels
        Antecedent items.
    j : column label
        Consequent item.

    Returns
    -------
    float
        The confidence, or ``nan`` when no row contains every item of ``I``
        (the ratio is undefined there; the division would otherwise raise
        ZeroDivisionError).
    """
    # Boolean mask of rows in which every antecedent item is non-zero.
    has_I = (df_in[list(I)].values != 0).all(axis=1)
    I_sup = int(has_I.sum())
    if I_sup == 0:
        return float("nan")
    has_j = df_in[j].values != 0
    Ij_sup = int((has_I & has_j).sum())
    return Ij_sup / I_sup

In [51]:
def interest_calc(df_in, j, conf):
    """Return the interest of a rule with consequent ``j``: ``conf - P(j)``.

    ``P(j)`` is the fraction of rows of ``df_in`` whose value in column ``j``
    is strictly positive.
    """
    total_rows = len(df_in)
    rows_with_j = sum(1 for value in df_in[j].values if value > 0)
    return conf - rows_with_j / total_rows

In [52]:
def assoc_rules(df_in, df_calc):
    """Build every single-consequent association rule from frequent itemsets.

    For each row (itemset) of ``df_calc`` and each position ``i`` in it, the
    item at ``i`` becomes the consequent ``j`` and the remaining items the
    antecedent ``I``. Each rule is scored with ``conf_calc`` and
    ``interest_calc`` against ``df_in``.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Recipe-by-item table passed through to the scoring functions.
    df_calc : pandas.DataFrame
        One frequent itemset per row, one item per column.

    Returns
    -------
    pandas.DataFrame
        Columns ``I[0]..I[k-2]``, ``j``, ``Confidence``, ``Interest`` with one
        row per rule. Empty (but with the same columns) when ``df_calc`` has
        no rows.
    """
    # Take the width from the frame's shape so that an empty itemset frame
    # yields an empty result instead of failing on ``values[0]``.
    n_items = df_calc.shape[1]
    columns = ["I[{}]".format(x) for x in range(n_items - 1)]
    columns += ["j", "Confidence", "Interest"]

    rules = []
    for basket in df_calc.values:
        for i in range(len(basket)):
            j = basket[i]
            # np.delete returns a new array, so ``basket`` is left untouched.
            I = np.delete(basket, i)
            conf = conf_calc(df_in, I, j)
            inter = interest_calc(df_in, j, conf)
            # Assemble the row as a plain list so the numeric scores never
            # pass through np.append's dtype coercion.
            rules.append(list(I) + [j, conf, inter])
    return pd.DataFrame(rules, columns=columns)

In [61]:
# Score every rule derivable from the frequent 6-itemsets, highest confidence first.
six_rules = assoc_rules(df, dfs[4]).sort_values(by='Confidence', ascending=False)

In [62]:
# Show the highest-confidence rules built from the 6-itemsets.
six_rules.head()

Unnamed: 0,I[0],I[1],I[2],I[3],I[4],j,Confidence,Interest
14,all purpose flour,baking powder,sugar,unsalted butter,vanilla extract,egg,0.933216,0.691845
20,all purpose flour,baking soda,salt,sugar,unsalted butter,egg,0.900735,0.659364
7,all purpose flour,salt,sugar,unsalted butter,vanilla extract,egg,0.86618,0.624809
2,all purpose flour,baking powder,salt,sugar,unsalted butter,egg,0.861397,0.620027
0,baking powder,egg,salt,sugar,unsalted butter,all purpose flour,0.817391,0.661401


In [57]:
# Score every rule derivable from the frequent 5-itemsets, highest confidence first.
five_rules = assoc_rules(df, dfs[3]).sort_values(by='Confidence', ascending=False)

In [58]:
# Show the highest-confidence rules built from the 5-itemsets.
five_rules.head()

Unnamed: 0,I[0],I[1],I[2],I[3],j,Confidence,Interest
106,baking powder,salt,unsalted butter,vanilla extract,egg,0.936995,0.695624
81,baking powder,sugar,unsalted butter,vanilla extract,egg,0.934081,0.692711
112,all purpose flour,baking powder,salt,vanilla extract,egg,0.933116,0.691745
96,baking powder,salt,sugar,vanilla extract,egg,0.93239,0.691019
77,all purpose flour,baking powder,unsalted butter,vanilla extract,egg,0.932039,0.690668
