In [1]:
import pandas as pd
from collections import Counter
from itertools import combinations
from collections import defaultdict
from datetime import datetime

---
### Goal:

The goal of this notebook is to apply association rules (a.k.a. seeded recommendations) to the data exported from the events log of the Practical Recommender Systems book. See chapter 5 of the book, in particular listing 5.4.3 on page 120.

---
### Import Data:

In [2]:
# Read the exported collector event log from CSV and preview its first five rows.
events_df = pd.read_csv('data/collector_log.csv')
events_df.head(5)

Unnamed: 0,id,created,user_id,content_id,event,session_id
0,1,2022-04-26 10:44:50-04,400003,4501244,details,794773
1,2,2022-04-26 10:44:50-04,400003,3521164,moreDetails,794773
2,3,2022-04-26 10:44:50-04,400005,3640424,details,441002
3,4,2022-04-26 10:44:50-04,400001,2823054,moreDetails,885440
4,5,2022-04-26 10:44:50-04,400005,3553976,genreView,441003


---
### Step 1 - Set minimum Support and Confidence thresholds:

In [3]:
# Minimum support, expressed as a fraction of all transactions (0.01 = 1%).
# It is multiplied by the number of transactions to get an absolute count threshold.
min_support = 0.01

---
### Step 2 - Get all Transactions:

In [4]:
# Keep only the "buy" events, ordered by session and then by content id,
# so that each session's purchases sit next to each other.
transactions_df = (
    events_df
    .loc[events_df['event'].eq('buy')]
    .sort_values(['session_id', 'content_id'])
)
transactions_df.head(10)

Unnamed: 0,id,created,user_id,content_id,event,session_id
707,708,2022-04-26 10:44:51-04,400006,5512872,buy,42460
804,805,2022-04-26 10:44:51-04,400006,3783958,buy,42462
2067,2068,2022-04-26 10:44:52-04,400006,1473832,buy,42484
2081,2082,2022-04-26 10:44:52-04,400006,1700841,buy,42485
2321,2322,2022-04-26 10:44:52-04,400006,2277860,buy,42487
3211,3212,2022-04-26 10:44:53-04,400006,4034354,buy,42499
3503,3504,2022-04-26 10:44:54-04,400006,1292566,buy,42500
3713,3714,2022-04-26 10:44:54-04,400006,1292566,buy,42509
4444,4445,2022-04-26 10:44:55-04,400006,5247022,buy,42516
5177,5178,2022-04-26 10:44:55-04,400006,2387499,buy,42519


In [5]:
# Treat every session_id as one transaction: collect the content ids bought in
# that session into a list and key the result by session_id.
transactions_dict = (
    transactions_df
    .groupby('session_id')['content_id']
    .apply(list)
    .to_dict()
)
# transactions_dict

---
### Step 3 - Create a list of itemsets (single item):

**Note:** in this function we begin by looking at the items individually and use the support threshold to decide whether to keep them for building recommendation rules. In essence, this function is the first step in filtering and narrowing down the universe of items for which we will calculate rules (see page 122 of the book).

In [6]:
def get_one_item_itemset(transactions_dict, min_support):
    """Return the single items that are frequent enough to build rules from.

    Support is measured in transactions: an item that appears several times
    inside the same transaction is counted once for that transaction, which
    keeps these counts consistent with the pair counts (those also ignore
    duplicates within a transaction).

    Parameters
    ----------
    transactions_dict : dict
        Maps a transaction id to the list of items in that transaction.
    min_support : float
        Minimum fraction of all transactions an item has to appear in.

    Returns
    -------
    dict
        Maps each retained item to the number of transactions containing it.
        An item is retained only when that number is strictly greater than
        ``len(transactions_dict) * min_support``. Items keep the order in
        which they were first seen.
    """
    # absolute number of transactions an item has to exceed:
    support_thresh = len(transactions_dict) * min_support

    # keeps track of how many transactions contain each item:
    item_counter = Counter()
    for items_list in transactions_dict.values():
        # dict.fromkeys drops repeats within the transaction while keeping
        # first-seen order, so the result order stays deterministic:
        for item in dict.fromkeys(items_list):
            item_counter[item] += 1

    # keep only the items whose transaction count is above the threshold:
    return {
        item: item_count
        for item, item_count in item_counter.items()
        if item_count > support_thresh
    }

In [7]:
# build the single-item itemset (item -> count) from all transactions:
one_item_itemset_dict = get_one_item_itemset(
    transactions_dict=transactions_dict,
    min_support=min_support,
)
# one_item_itemset_dict

---
### Step 4 - Create a list of itemsets (two or more items):
Note: in this function we look at every pair of items that were bought together in a transaction containing two or more items, keeping only the pairs in which both items met the support threshold in the single-item itemset. From here we create a dictionary containing each pair along with its frequency (see page 122 of the book).

In [8]:
def get_two_plus_items_itemset(transactions_dict, one_itemset_dict, min_support):
    """Count how often each pair of supported items occurs in the same transaction.

    Parameters
    ----------
    transactions_dict : dict
        Maps a transaction id to the list of items in that transaction.
    one_itemset_dict : dict
        Single items that met the support threshold; only its keys are used.
    min_support : float
        Not used: pairs are not filtered by support here. The parameter is
        kept so existing callers keep working.

    Returns
    -------
    dict
        Maps a 2-tuple ``(item_a, item_b)`` to the number of transactions that
        contain both items. Each tuple is in ascending order, so a given pair
        always maps to one key. Items therefore have to be mutually orderable.
    """
    # keeps track of pair rolling counts
    two_item_counter = Counter()

    for item_list in transactions_dict.values():
        # Drop duplicates and unsupported items first, then sort. Sorting gives
        # every pair one canonical tuple; relying on set iteration order could
        # yield both (a, b) and (b, a) and split the count between two keys.
        supported_items = sorted(
            {item for item in item_list if item in one_itemset_dict}
        )
        # combinations() yields nothing for fewer than two items, so short
        # transactions need no special case:
        two_item_counter.update(combinations(supported_items, 2))

    return dict(two_item_counter)
                

In [9]:
# build the item-pair itemset (pair -> count) from the supported single items:
two_item_itemset_dict = get_two_plus_items_itemset(
    transactions_dict=transactions_dict,
    one_itemset_dict=one_item_itemset_dict,
    min_support=min_support,
)
# two_item_itemset_dict

---
### Step 5 - Calculate Association Rules:

In [10]:
def calculate_association_rules(one_item_itemset, two_item_itemset, transactions_dict):
    """Derive ``source -> target`` association rules from the item and pair counts.

    Parameters
    ----------
    one_item_itemset : dict
        Maps an item to its frequency.
    two_item_itemset : dict
        Maps a pair of items (a 2-tuple) to the frequency of that pair.
    transactions_dict : dict
        All transactions; only its length (the transaction count) is used.

    Returns
    -------
    list of tuple
        One ``(timestamp, source, target, support, confidence)`` tuple per
        rule, where

        * ``timestamp`` is the time this function was called (shared by all rules),
        * ``support`` is the pair frequency divided by the number of transactions,
        * ``confidence`` is the pair frequency divided by the source frequency,
          i.e. how often the target is bought when the source is bought.

        Rules are ordered by source (in ``one_item_itemset`` order) and then by
        pair (in ``two_item_itemset`` order).
    """
    # get total number of transactions:
    total_transactions = len(transactions_dict)

    # obtain calculation timestamp:
    timestamp = datetime.now()

    # Index the pairs by each of their members once, so that every source item
    # can look up its pairs directly instead of rescanning all pairs:
    pairs_by_item = defaultdict(list)
    for group_key, group_freq in two_item_itemset.items():
        for member in set(group_key):
            pairs_by_item[member].append((group_key, group_freq))

    # define list to save derived rules:
    rules = []

    # iterate through each item of the single-item itemset:
    for source, source_freq in one_item_itemset.items():
        # .get() avoids inserting empty entries for items that are in no pair:
        for group_key, group_freq in pairs_by_item.get(source, ()):
            # the remaining item of the pair is the target:
            target = set(group_key).difference([source]).pop()
            # fraction of all transactions that contain both source and target:
            support = group_freq / total_transactions
            # how likely the target is to be purchased whenever the source is:
            confidence = group_freq / source_freq

            # append results to rules list as tuple:
            rules.append((timestamp, source, target, support, confidence))

    return rules

In [11]:
# Derive the rules and keep them in a variable so they can be reused, then
# preview only the first few instead of dumping the whole list into the output:
association_rules = calculate_association_rules(one_item_itemset_dict, two_item_itemset_dict, transactions_dict)
association_rules[:10]

[(datetime.datetime(2022, 5, 5, 5, 48, 39, 730983),
  5512872,
  3874544,
  0.0011037527593818985,
  0.03333333333333333),
 (datetime.datetime(2022, 5, 5, 5, 48, 39, 730983),
  5512872,
  3783958,
  0.0011037527593818985,
  0.03333333333333333),
 (datetime.datetime(2022, 5, 5, 5, 48, 39, 730983),
  5512872,
  2869728,
  0.0011037527593818985,
  0.03333333333333333),
 (datetime.datetime(2022, 5, 5, 5, 48, 39, 730983),
  5512872,
  4513674,
  0.0011037527593818985,
  0.03333333333333333),
 (datetime.datetime(2022, 5, 5, 5, 48, 39, 730983),
  5512872,
  3315342,
  0.0011037527593818985,
  0.03333333333333333),
 (datetime.datetime(2022, 5, 5, 5, 48, 39, 730983),
  5512872,
  1878870,
  0.0011037527593818985,
  0.03333333333333333),
 (datetime.datetime(2022, 5, 5, 5, 48, 39, 730983),
  3783958,
  5512872,
  0.0011037527593818985,
  0.05263157894736842),
 (datetime.datetime(2022, 5, 5, 5, 48, 39, 730983),
  3783958,
  4572514,
  0.0011037527593818985,
  0.05263157894736842),
 (datetime.datet