In [1]:
import pandas as pd
import numpy as np
import sys
from itertools import combinations, groupby
from collections import Counter
from IPython.display import display

In [2]:
def size(obj):
  """Return the size of ``obj`` as a string such as ``"12.34 MB"``.

  The byte count comes from ``sys.getsizeof`` and is divided by
  1000 * 1000 (decimal megabytes), then formatted with two decimals.
  """
  bytes_per_mb = 1000 * 1000
  megabytes = sys.getsizeof(obj) / bytes_per_mb
  return "{0:.2f} MB".format(megabytes)

Load text data

In [3]:
# Load the order/product rows from CSV into a DataFrame.
# NOTE(review): the path is absolute and looks like a Colab Google Drive mount;
# it will need adjusting in any other environment.
orders=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/order_products__prior.csv')
# The "Dimension" label prints orders.shape and the "Shape" label prints the
# size() string (memory reported by sys.getsizeof, in MB).
print('orders: \nDimension: {0} \nShape: {1}'.format(orders.shape,size(orders)))

orders: 
Dimension: (32434489, 4) 
Shape: 1037.90 MB


In [14]:
# Count rows per product_id and show the first row of the result; value_counts
# sorts by count in descending order, so this is the most frequent product.
orders.product_id.value_counts().reset_index().iloc[0]

index          24852
product_id    472565
Name: 0, dtype: int64

Convert the data into the format expected by the association-rule functions: a Series of item ids indexed by order id

In [15]:
# Reshape to a Series of product ids indexed by order_id and named 'item_id'.
# NOTE: this rebinds `orders` from the loaded DataFrame to a Series, so this
# cell is not safe to re-run without first re-running the load cell.
orders=orders.set_index('order_id')['product_id'].rename('item_id')
print(orders.head(10))
# Last expression of the cell: displays the type of the reshaped object.
type(orders)

order_id
2    33120
2    28985
2     9327
2    45918
2    30035
2    17794
2    40141
2     1819
2    43668
3    33754
Name: item_id, dtype: int64


pandas.core.series.Series

In [None]:
#returns the frequency counts for items and item pairs
def freq(iterable):
  """Count how often each distinct value occurs in ``iterable``.

  Parameters
  ----------
  iterable : pandas.Series or any iterable of hashable values
      A Series is counted with ``value_counts``; anything else (an Index,
      a generator of item pairs, ...) is counted with ``collections.Counter``.

  Returns
  -------
  pandas.Series
      Series named ``'freq'`` mapping each distinct value to its count.
      Tuple values from a non-Series iterable become a MultiIndex.
  """
  # isinstance replaces the original `type(...) == pd.core.series,Series`,
  # whose comma made the `if` statement a syntax error.
  if isinstance(iterable, pd.Series):
    return iterable.value_counts().rename('freq')
  return pd.Series(Counter(iterable)).rename('freq')


#returns number of unique orders
def order_count(order_item):
  """Return the number of distinct index values (orders) in ``order_item``.

  Parameters
  ----------
  order_item : pandas.Series or pandas.DataFrame
      Object whose index holds the order ids (one row per order/item).

  Returns
  -------
  int
      Count of unique index values.
  """
  # Vectorized count; avoids materializing every index entry as a Python
  # object in a set, which is slow for tens of millions of rows.
  return order_item.index.nunique(dropna=False)


#returns generator that yields item pairs, one at a time
def get_item_pairs(order_item):
  """Yield every 2-item combination found within each order, one at a time.

  Parameters
  ----------
  order_item : pandas.Series
      Item ids indexed by order id (one row per order/item).

  Yields
  ------
  tuple
      ``(item_a, item_b)`` pairs from ``itertools.combinations``, in the order
      the items appear within the order; ``(a, b)`` and ``(b, a)`` are
      therefore distinct results.

  Notes
  -----
  ``itertools.groupby`` only groups consecutive rows, so all rows of one
  order must be contiguous in ``order_item``.
  """
  # to_numpy() replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
  # Column 0 is the former index (order id), column 1 the item id.
  rows = order_item.reset_index().to_numpy()
  for _order_id, order_rows in groupby(rows, key=lambda row: row[0]):
    items = [row[1] for row in order_rows]
    yield from combinations(items, 2)


#returns frequency and support associated with the item
def merge_item_stats(item_pairs, item_stats):
  """Attach per-item frequency and support to each item pair.

  Parameters
  ----------
  item_pairs : pandas.DataFrame
      One row per pair, with the item ids in columns ``item_A`` and ``item_B``.
  item_stats : pandas.DataFrame
      Indexed by item id, with columns ``freq`` and ``support``.

  Returns
  -------
  pandas.DataFrame
      ``item_pairs`` with added columns ``freqA``/``supportA`` (stats of
      ``item_A``) and ``freqB``/``supportB`` (stats of ``item_B``).
  """
  stats_a = item_stats.rename(columns={'freq': 'freqA', 'support': 'supportA'})
  stats_b = item_stats.rename(columns={'freq': 'freqB', 'support': 'supportB'})
  # Both joins key on the id columns 'item_A' / 'item_B'; the original used
  # 'itemB' for the second join, which is not a column of item_pairs.
  return (item_pairs
          .merge(stats_a, left_on='item_A', right_index=True)
          .merge(stats_b, left_on='item_B', right_index=True))


#returns name associated with item
def merge_item_name(rules, item_name):
  """Replace the item ids in ``rules`` with human-readable item names.

  Parameters
  ----------
  rules : pandas.DataFrame
      Association rules with id columns ``item_A`` and ``item_B`` plus the
      metric columns listed in ``columns`` below.
  item_name : pandas.DataFrame
      Lookup table with columns ``item_id`` and ``item_name``.

  Returns
  -------
  pandas.DataFrame
      The rules restricted to ``columns``, where ``itemA`` / ``itemB`` hold
      the names of ``item_A`` / ``item_B``.
  """
  columns = ['itemA','itemB','freqAB','supportAB','freqA','supportA','freqB','supportB','confidenceAtoB','confidenceBtoA','lift']
  names_a = item_name.rename(columns={'item_name': 'itemA'})
  names_b = item_name.rename(columns={'item_name': 'itemB'})
  # Join on the id columns 'item_A' / 'item_B'. The original first join used
  # 'itemA', which only exists on the right-hand (name) table.
  rules = (rules
           .merge(names_a, left_on='item_A', right_on='item_id')
           .merge(names_b, left_on='item_B', right_on='item_id'))
  return rules[columns]

In [None]:
def association_rules(order_item, min_support):
  """Mine item-pair association rules from order/item data.

  Parameters
  ----------
  order_item : pandas.Series
      Item ids indexed by order id (one row per order/item). Rows of the same
      order must be contiguous, as required by ``get_item_pairs``.
  min_support : float
      Minimum support, expressed as a percentage of orders (``0.01`` means
      0.01%), applied first to single items and then to item pairs.

  Returns
  -------
  pandas.DataFrame
      One row per qualifying pair with columns ``item_A``, ``item_B``,
      ``freqAB``, ``supportAB``, ``freqA``, ``supportA``, ``freqB``,
      ``supportB``, ``confidenceAtoB``, ``confidenceBtoA`` and ``lift``,
      sorted by ``lift`` in descending order. Supports are percentages;
      confidences and lift are plain ratios.

  Progress figures are printed at each filtering stage.
  """
  # The original format spec "{:22d" lacked its closing brace and raised
  # ValueError before any work was done.
  print("Starting order_item: {:22d}".format(len(order_item)))

  # Calculate item frequency and support (percentage of orders)
  item_stats = freq(order_item).to_frame("freq")
  item_stats['support'] = item_stats['freq'] / order_count(order_item) * 100

  # Filter from order_item items below min support
  qualifying_items = item_stats[item_stats['support'] >= min_support].index
  order_item = order_item[order_item.isin(qualifying_items)]

  print("Items with support >= {}: {:15d}".format(min_support, len(qualifying_items)))
  print("Remaining order_item: {:21d}".format(len(order_item)))

  # Filter from order_item orders with less than 2 items (they yield no pairs)
  order_size = freq(order_item.index)
  qualifying_orders = order_size[order_size >= 2].index
  order_item = order_item[order_item.index.isin(qualifying_orders)]

  print("Remaining orders with 2+ items: {:11d}".format(len(qualifying_orders)))
  print("Remaining order_item: {:21d}".format(len(order_item)))

  # Recalculate item frequency and support on the filtered data
  item_stats = freq(order_item).to_frame('freq')
  item_stats['support'] = item_stats['freq'] / order_count(order_item) * 100

  # Lazily generate the item pairs of every remaining order
  item_pair_gen = get_item_pairs(order_item)

  # Calculate item pair frequency and support (percentage of remaining orders)
  item_pairs = freq(item_pair_gen).to_frame('freqAB')
  item_pairs['supportAB'] = item_pairs['freqAB'] / len(qualifying_orders) * 100

  print("Item pairs: {:31d}".format(len(item_pairs)))

  # Filter from item_pairs those below min support
  item_pairs = item_pairs[item_pairs['supportAB'] >= min_support]

  print("Item pairs with support >= {}: {:10d}\n".format(min_support, len(item_pairs)))

  # Create table of association rules and compute relevant metrics.
  # The tuple keys became a two-level MultiIndex, which reset_index exposes
  # as 'level_0' / 'level_1'.
  item_pairs = item_pairs.reset_index().rename(columns={'level_0': 'item_A', 'level_1': 'item_B'})
  item_pairs = merge_item_stats(item_pairs, item_stats)

  item_pairs['confidenceAtoB'] = item_pairs['supportAB'] / item_pairs['supportA']
  item_pairs['confidenceBtoA'] = item_pairs['supportAB'] / item_pairs['supportB']
  # Supports are percentages, so P(AB) / (P(A) * P(B)) equals
  # supportAB * 100 / (supportA * supportB). Without the factor of 100 the
  # lift comes out 100x too small and "lift > 1" loses its meaning.
  item_pairs['lift'] = item_pairs['supportAB'] * 100 / (item_pairs['supportA'] * item_pairs['supportB'])

  # Return association rules sorted by lift in descending order
  return item_pairs.sort_values('lift', ascending=False)

In [None]:
# Mine pair rules from the order_id -> item_id Series. min_support is compared
# against supports that association_rules computes as percentages (x100), so
# 0.01 here means 0.01% of orders.
rules = association_rules(orders, 0.01)