In [2]:
%matplotlib inline

from pathlib import Path

import heapq
from collections import defaultdict

import pandas as pd
import matplotlib.pylab as plt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

from surprise import dataset, Reader
from surprise.prediction_algorithms import KNNBasic
from surprise.model_selection import train_test_split

In [3]:
DATA = Path('instacart')

In [4]:
order_df = pd.read_csv(DATA / 'order_products__train.csv') 
products_df = pd.read_csv(DATA / 'products.csv') 

In [5]:
print(order_df.shape)
print(products_df.shape)

(1384617, 4)
(49688, 4)


In [6]:
order_df.head(3)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0


In [7]:
products_df.head(3)

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7


In [10]:
merge_df = pd.merge(order_df,products_df,on='product_id', how='left')
merge_df.shape

(1384617, 7)

In [11]:
merge_df.head(3)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,1,49302,1,1,Bulgarian Yogurt,120,16
1,1,11109,2,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16
2,1,10246,3,0,Organic Celery Hearts,83,4


In [34]:
merge_df.columns

Index(['order_id', 'product_id', 'add_to_cart_order', 'reordered',
       'product_name', 'aisle_id', 'department_id'],
      dtype='object')

##### we will ignore the order the item was added to the cart since all we care about right now is what is in the cart. 
##### we will ignore product id since we can use product name as the identifier
##### we will ignore aisle id and department id since we only care about purchase associations for this problem. 
##### we will ignore reordered beacuse we were told it did not matter wether the custmoer was going to "...buy it again or recommend trying for the first time ".

In [100]:
ignore = [ 'add_to_cart_order', 'product_id', 'aisle_id', 'reordered','department_id']

In [101]:
merge_df.drop(columns=ignore)

Unnamed: 0,order_id,product_name
0,1,Bulgarian Yogurt
1,1,Organic 4% Milk Fat Whole Milk Cottage Cheese
2,1,Organic Celery Hearts
3,1,Cucumber Kirby
4,1,Lightly Smoked Sardines in Olive Oil
...,...,...
1384612,3421063,Natural Artesian Water
1384613,3421063,Twice Baked Potatoes
1384614,3421070,Organic Unsweetened Almond Milk
1384615,3421070,Creamy Peanut Butter


In [102]:

merge_pivot = pd.pivot_table(merge_df.drop(columns=ignore)[0:100000], index='order_id' , columns= ['product_name'], aggfunc= lambda  x: 1, fill_value=0)
merge_pivot

product_name,#2 Coffee Filters,#4 Natural Brown Coffee Filters,0 Calorie Fuji Apple Pear Water Beverage,0% Fat Black Cherry Greek Yogurt y,0% Fat Blueberry Greek Yogurt,0% Fat Free Organic Milk,0% Fat Organic Greek Vanilla Yogurt,0% Fat Strawberry Greek Yogurt,0% Fat Superfruits Greek Yogurt,0% Greek Strained Yogurt,...,of Hanover 100 Calorie Pretzels Mini,of Norwich Original English Mustard Powder Double Superfine,smartwater® Electrolyte Enhanced Water,vitaminwater® XXX Acai Blueberry Pomegranate,with Bleach Disinfectant Cleanser Scratch Free Lavender Fresh,with Crispy Almonds Cereal,with Dawn Action Pacs Fresh Scent Dishwasher Detergent Pacs,with Olive Oil Mayonnaise Dressing,with Sweet & Smoky BBQ Sauce Cheeseburger Sliders,with Xylitol Minty Sweet Twist 18 Sticks Sugar Free Gum
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246741,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
246768,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
246788,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
246796,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [97]:
unpiv = pd.melt(merge_pivot)
unpiv

Unnamed: 0,product_name,reordered,value
0,#2 Coffee Filters,0,0
1,#2 Coffee Filters,0,0
2,#2 Coffee Filters,0,0
3,#2 Coffee Filters,0,0
4,#2 Coffee Filters,0,0
...,...,...,...
224661757,with Xylitol Minty Sweet Twist 18 Sticks Sugar...,1,0
224661758,with Xylitol Minty Sweet Twist 18 Sticks Sugar...,1,0
224661759,with Xylitol Minty Sweet Twist 18 Sticks Sugar...,1,0
224661760,with Xylitol Minty Sweet Twist 18 Sticks Sugar...,1,0


In [104]:
itemsets = apriori(merge_pivot, min_support=0.005, use_colnames=True)
itemsets



Unnamed: 0,support,itemsets
0,0.008441,(100% Raw Coconut Water)
1,0.009708,(100% Recycled Paper Towels)
2,0.014667,(100% Whole Wheat Bread)
3,0.012873,(2% Reduced Fat Milk)
4,0.005065,(2% Reduced Fat Organic Milk)
...,...,...
391,0.006331,"(Organic Strawberries, Organic Zucchini)"
392,0.006120,"(Organic Strawberries, Seedless Red Grapes)"
393,0.005170,"(Raspberries, Strawberries)"
394,0.005698,"(Organic Strawberries, Organic Hass Avocado, B..."


In [113]:
# and convert into rules
rules = association_rules(itemsets, metric='confidence', min_threshold=0.325)
rules.sort_values(by=['lift'], ascending=False).head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
9,"(Organic Strawberries, Organic Hass Avocado)",(Bag of Organic Bananas),0.010024,0.119236,0.005698,0.568421,4.767191,0.004503,2.040795,0.798235
11,"(Bag of Organic Bananas, Organic Raspberries)",(Organic Strawberries),0.015089,0.081882,0.005276,0.34965,4.27015,0.00404,1.411729,0.777549
10,"(Organic Strawberries, Organic Raspberries)",(Bag of Organic Bananas),0.011185,0.119236,0.005276,0.471698,3.956003,0.003942,1.66716,0.755672
0,(Apple Honeycrisp Organic),(Bag of Organic Bananas),0.017516,0.119236,0.006542,0.373494,3.132392,0.004454,1.405835,0.692892
5,(Organic Raspberries),(Bag of Organic Bananas),0.040941,0.119236,0.015089,0.368557,3.090984,0.010207,1.394843,0.705357
1,(Organic D'Anjou Pears),(Bag of Organic Bananas),0.016355,0.119236,0.005698,0.348387,2.921827,0.003748,1.351667,0.668685
3,(Organic Large Extra Fancy Fuji Apple),(Bag of Organic Bananas),0.024586,0.119236,0.008547,0.347639,2.915557,0.005615,1.350118,0.673573
4,(Organic Navel Orange),(Bag of Organic Bananas),0.014878,0.119236,0.00517,0.347518,2.914536,0.003396,1.349866,0.666813
2,(Organic Hass Avocado),(Bag of Organic Bananas),0.053603,0.119236,0.018044,0.336614,2.823091,0.011652,1.32768,0.682355
6,(Cucumber Kirby),(Banana),0.018888,0.141184,0.007175,0.379888,2.690733,0.004509,1.384938,0.640451


### Based on these rules we can see that:
##### Purchasing Organic Strawberries and Organic Hass Avocado increases the likleyhood for the customer to purchase Bag of Organic Bananas by 4.77 times
##### Purchasing Bag of Organic Bananas and Organic Raspberries increases the likleyhood for the customer to purchase  Organic Strawberries by 4.27 times
##### Purchasing Organic Strawberries and Organic Raspberries increases the likleyhood for the customer to purchase Bag of Organic Bananas by 3.96 times
##### Purchasing Apple Honeycrisp Organic increases the likleyhood for the customer to purchase Bag of Organic Bananas by 3.13 times
##### Purchasing Organic Raspberries increases the likleyhood for the customer to purchase Bag of Organic Bananas by 3.09 times
##### Purchasing Organic D'Anjou Pears increases the likleyhood for the customer to purchase Bag of Organic Bananas by 2.92 times
##### Purchasing Organic Large Extra Fancy Fuji Apple increases the likleyhood for the customer to purchase Bag of Organic Bananas by 2.92 times
##### Purchasing Organic Navel Orange increases the likleyhood for the customer to purchase Bag of Organic Bananas by 2.91 times
##### Purchasing Organic Hass Avocado increases the likleyhood for the customer to purchase Bag of Organic Bananas by 2.82 times
##### Purchasing Cucumber Kirby increases the likleyhood for the customer to purchase Banana by 2.69 times

What we can see here is that amongst all the rules that we generated from this dataset, the highest lifts (and confidence of those rows) show us that Bag of Organic Bananas are the most common secondary item purchased if the customer purchases an organic product. 