In [2]:
import pandas as pd
from tqdm import tqdm
import numpy as np

In [3]:
# Load the two CSV inputs from the current working directory.
# NOTE(review): `data_sales` is presumably per-sale metadata (going by the
# file name); it is not used by any of the cells visible here — confirm it is
# needed further down before keeping the load.
data_sales = pd.read_csv('sales__metadata-2023.csv')
# Click events; the cells below read its `user_id` and `sale_id` columns.
clic_sales = pd.read_csv('user_events__click_sale-2023.csv')

In [4]:
def creating_negative_interactions(user_events: pd.DataFrame, num_negatives: int = 1, seed=None) -> pd.DataFrame:
    """
    Create a labelled interaction dataset by adding randomly sampled negatives.

    Every distinct (user_id, sale_id) pair found in ``user_events`` is emitted
    once with ``interaction == 1``. Each positive row is immediately followed by
    ``num_negatives`` rows for the same user with ``interaction == 0``, whose
    sale is drawn uniformly at random from the sales present in ``user_events``
    and is guaranteed not to be a sale that user interacted with.

    Parameters
    ----------
    user_events : pd.DataFrame
        Interaction log; must contain the columns ``user_id`` and ``sale_id``.
    num_negatives : int, default 1
        Number of negative rows generated per positive pair (1 gives a 1:1 ratio).
    seed : int or None, default None
        When given, sampling uses ``np.random.default_rng(seed)`` so the result
        is reproducible. When ``None``, the global ``np.random`` state is used.

    Returns
    -------
    pd.DataFrame
        Columns ``user_id``, ``sale_id``, ``interaction``. Positives appear in
        order of first appearance in ``user_events``.

    Raises
    ------
    ValueError
        If ``num_negatives`` is negative.

    Notes
    -----
    * A user who has interacted with every available sale has no valid
      negative, so only the positive rows are emitted for that user (rejection
      sampling would otherwise loop forever).
    * Negatives are drawn independently, so the same (user, sale) negative may
      appear more than once.
    """
    if num_negatives < 0:
        raise ValueError("num_negatives must be non-negative")

    # Candidate pool for negative sampling: every distinct sale in the input
    # (taken from the argument, not from a notebook-level global).
    all_sale_ids = np.asarray(user_events['sale_id'].unique())
    n_sales = len(all_sale_ids)

    # Distinct observed (user, sale) pairs. A dict gives set-like O(1)
    # membership tests while preserving first-appearance order, which keeps
    # the output row order stable between runs.
    positive_pairs = dict.fromkeys(zip(user_events['user_id'], user_events['sale_id']))

    # Number of distinct sales seen per user, used to detect users for whom no
    # negative exists.
    seen_per_user = {}
    for user, _ in positive_pairs:
        seen_per_user[user] = seen_per_user.get(user, 0) + 1

    # Both the `np.random` module and a Generator expose `.choice`.
    rng = np.random if seed is None else np.random.default_rng(seed)

    # Placeholders that will hold the training data
    users, sales, labels = [], [], []

    for user, sale in tqdm(positive_pairs):
        users.append(user)
        sales.append(sale)
        labels.append(1)  # observed interactions are positives

        if seen_per_user[user] >= n_sales:
            # This user interacted with every sale: nothing left to sample.
            continue

        for _ in range(num_negatives):
            # Rejection sampling: redraw until the sale is unseen by this user.
            candidate = rng.choice(all_sale_ids)
            while (user, candidate) in positive_pairs:
                candidate = rng.choice(all_sale_ids)
            users.append(user)
            sales.append(candidate)
            labels.append(0)

    augmented_interactions = pd.DataFrame({'user_id': users, 'sale_id': sales, 'interaction': labels})

    return augmented_interactions

In [5]:
# Build the labelled dataset (observed pairs plus sampled negatives) from the
# click events. In the recorded run below this processed 7,920,106 distinct
# (user, sale) pairs in roughly 76 seconds.
augmented_data = creating_negative_interactions(clic_sales)

100%|██████████| 7920106/7920106 [01:16<00:00, 103827.58it/s]


In [6]:
# Inspect the result: each positive row (interaction == 1) is followed by a
# sampled negative row (interaction == 0) for the same user.
augmented_data

Unnamed: 0,user_id,sale_id,interaction
0,dCDOUoZdh0fP4pf7GAIUyrMy7D22,f2e175b91ffc4c0794e8cbaff,1
1,dCDOUoZdh0fP4pf7GAIUyrMy7D22,5d5391f18b914449a59001ae3,0
2,ihIT39VE9RTJhVBz5LZzPqxHe5X2,5f7ef8dd863a48c5907688eac,1
3,ihIT39VE9RTJhVBz5LZzPqxHe5X2,1b050e3c1bea4521a1cab5b04,0
4,o5T2bfIdandbpKJ0IZDz9vA04XH3,11be041fc71046348b1ae7c15,1
...,...,...,...
15840207,9UA0EBxNfcYHAWjvucpS2ZzqzFm1,43775c67647c4caeac9e9b058,0
15840208,8gudYcFDReP06ceSDr7lpo5kCiy2,dacba29e64b849189066c3733,1
15840209,8gudYcFDReP06ceSDr7lpo5kCiy2,811b885029d2441f8c2579438,0
15840210,jiWWW32n4lUTh3TC975MQsf6IqM2,f8e4d76f222443b6a05c57928,1
