# Interaction Data Generation
Using the transactions data is not enough. This is because the transactions data only provide information about which items the customers were buying. However, to train a recommender system, we need training samples that indicates that the customer interacts with the item (positive signal) and also samples that indicates otherwise (negative signal).

### Setup

In [1]:
import random
from pathlib import Path

import numpy as np
import pandas as pd

# Setup path to data folder
data_path = Path('../data/')
processed_data_path = data_path / 'processed'

  from .autonotebook import tqdm as notebook_tqdm


### Data Sampling
The transactions dataset contains ~30 million rows. It will take a very long time to train our model with this many data. For demonstration purpose, let's start with transactions from 1000 customers only.

In [None]:
CUSTOMER_DATASET_SIZE = 1000
RANDOM_SEED = 42

transactions_df = pd.read_pickle(processed_data_path / 'transactions.pkl')
customers_df = pd.read_pickle(processed_data_path / 'customers.pkl')

random.seed(RANDOM_SEED)

print(
    f'Sampling {CUSTOMER_DATASET_SIZE} customers from {len(customers_df)} total customers.'
)
customers_df = customers_df.sample(n=CUSTOMER_DATASET_SIZE, random_state=RANDOM_SEED)

print(f'Number of transactions for all customers: {len(transactions_df)}')
transactions_df = pd.merge(
    transactions_df, customers_df[['customer_id']], how='inner', on='customer_id'
)
print(
    f'Number of transactions for the {CUSTOMER_DATASET_SIZE} sampled customers: {len(transactions_df)}'
)

Sampling 1000 customers from 1356119 total customers.
Number of transactions for all customers: 31788324
Number of transactions for the 1000 sampled customers: 24341


Before we proceed, let's save the sampled `customers_df` and `transactions_df` as pickles.

In [4]:
customers_df.to_pickle(data_path / 'processed' / 'sampled_customers.pkl')
transactions_df.to_pickle(data_path / 'processed' / 'sampled_transactions.pkl')

### Feature Engineering

In [2]:
# Load sampled data
sampled_transactions_df: pd.DataFrame = pd.read_pickle(
    data_path / 'processed' / 'sampled_transactions.pkl'
)
sampled_customers_df: pd.DataFrame = pd.read_pickle(
    data_path / 'processed' / 'sampled_customers.pkl'
)

print(
    f"Number of unique sampled customers: {len(sampled_customers_df['customer_id'].unique())}"
)
print(
    f'Number of transactions from the sampled customers: {len(sampled_transactions_df)}'
)

Number of unique sampled customers: 1000
Number of transactions from the sampled customers: 24341


In [3]:
all_articles = sampled_transactions_df['article_id'].unique().tolist()
len(all_articles)

14013

In [13]:
MIN_IGNORES = 40
MAX_IGNORES = 60
CLICK_BEFORE_PURCHASE_PROB = 0.9
EXTRA_CLICKS_PROB = 0.95
MIN_EXTRA_CLICKS = 5
MAX_EXTRA_CLICKS = 15


def generate_timestamps(base_timestamp, count, min_hours, max_hours) -> np.ndarray:
    hours = np.random.randint(min_hours, max_hours, size=count)
    return base_timestamp - (hours * 3600000)


for customer_id in sampled_customers_df['customer_id'].tolist():
    customer_purchases = sampled_transactions_df[
        sampled_transactions_df['customer_id'] == customer_id
    ]

    if len(customer_purchases) == 0:
        continue

    customer_articles = {'purchased': set(), 'clicked': set(), 'ignored': set()}
    last_purchase_timestamp = customer_purchases['t_dat'].max()
    interactions = []

    # First, we generate multiple ignores
    num_ignores = np.random.randint(MIN_IGNORES, MAX_IGNORES)
    if all_articles and num_ignores > 0:
        ignore_timestamps = generate_timestamps(
            base_timestamp=last_purchase_timestamp,
            count=num_ignores,
            min_hours=1,
            max_hours=96,
        ).tolist()
        selected_ignores = np.random.choice(
            all_articles, size=min(num_ignores, len(all_articles)), replace=False
        ).tolist()

        # Generate multiple sets of ignores to increase the count
        for timestamp, article_id in zip(ignore_timestamps, selected_ignores):
            # Add 1-2 ignore events for the same article
            num_ignore_events = np.random.randint(1, 3)
            for _ in range(num_ignore_events):
                # Add some hours difference
                ignore_timestamp = timestamp - np.random.randint(1, 12) * 3600000
                interactions.append(
                    {
                        't_dat': ignore_timestamp,
                        'customer_id': customer_id,
                        'article_id': article_id,
                        'interaction_score': 0,
                        'prev_article_id': None,
                    }
                )
            customer_articles['ignored'].add(article_id)

    # Second, we process the purchases and their clicks
    for row in customer_purchases.iterrows():
        # See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iterrows.html
        # on why we need [1] to obtain the data.
        purchase_timestamp = row[1]['t_dat']
        article_id = row[1]['article_id']

        # Add clicks before purchase
        if np.random.random() < CLICK_BEFORE_PURCHASE_PROB:
            num_pre_clicks = np.random.randint(1, 3)
            for _ in range(num_pre_clicks):
                click_timestamp = generate_timestamps(
                    base_timestamp=purchase_timestamp,
                    count=1,
                    min_hours=1,
                    max_hours=48,
                )[0]
                interactions.append(
                    {
                        't_dat': click_timestamp,
                        'customer_id': customer_id,
                        'article_id': article_id,
                        'interaction_score': 1,
                        'prev_article_id': None,
                    }
                )
                customer_articles['clicked'].add(article_id)

        # Add purchase
        interactions.append(
            {
                't_dat': purchase_timestamp,
                'customer_id': customer_id,
                'article_id': article_id,
                'interaction_score': 2,
                'prev_article_id': None,
            }
        )
        customer_articles['purchased'].add(article_id)

    # Third, we generate extra clicks
    if np.random.random() < EXTRA_CLICKS_PROB:
        num_extra_clicks = np.random.randint(MIN_EXTRA_CLICKS, MAX_EXTRA_CLICKS)
        available_for_clicks = list(
            set(all_articles)
            - customer_articles['purchased']
            - customer_articles['clicked']
            - customer_articles['ignored']
        )

        if available_for_clicks and num_extra_clicks > 0:
            for _ in range(num_extra_clicks):
                click_timestamps = generate_timestamps(
                    base_timestamp=last_purchase_timestamp,
                    count=num_extra_clicks,
                    min_hours=1,
                    max_hours=72,
                )
                selected_clicks = np.random.choice(
                    available_for_clicks,
                    size=min(num_extra_clicks, len(available_for_clicks)),
                    replace=False,
                ).tolist()

                for timestamp, article_id in zip(click_timestamps, selected_clicks):
                    interactions.append(
                        {
                            't_dat': timestamp,
                            'customer_id': customer_id,
                            'article_id': article_id,
                            'interaction_score': 1,
                            'prev_article_id': None,
                        }
                    )

In [16]:
interactions_df = pd.DataFrame(interactions)
sorted_df = interactions_df.sort_values(['customer_id', 't_dat'])

final_df = sorted_df.assign(
    prev_article_id=sorted_df.groupby('customer_id')['article_id']
    .shift(1)
    .fillna('START')
)

In [19]:
final_df.head()

Unnamed: 0,t_dat,customer_id,article_id,interaction_score,prev_article_id
31,1579885200000,6d6e91ea618beec6fa2843d5fbf25a9164c6318abdd715...,455014028,0,START
23,1579888800000,6d6e91ea618beec6fa2843d5fbf25a9164c6318abdd715...,834002002,0,455014028
39,1579888800000,6d6e91ea618beec6fa2843d5fbf25a9164c6318abdd715...,788115001,0,834002002
68,1579896000000,6d6e91ea618beec6fa2843d5fbf25a9164c6318abdd715...,579302009,0,788115001
24,1579906800000,6d6e91ea618beec6fa2843d5fbf25a9164c6318abdd715...,834002002,0,579302009


### Save the Engineered Features
Lastly, we pickle it to save the generated interactions data

In [20]:
final_df.to_pickle(data_path / 'processed' / 'interactions.pkl')