In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np

In [2]:
data_sales = pd.read_csv('sales__metadata-2023.csv')
clic_sales = pd.read_csv('user_events__click_sale-2023.csv')

In [51]:
def creating_negative_interactions(user_events: pd.DataFrame):
    """
    The aim of this function is to create negative samples from a dataset of interactions between users and sales.
    """

    # Get a list of all sales
    all_saleIds = user_events['sale_id'].unique()

    # Placeholders that will hold the training data
    users, sales, labels = [], [], []

    # This is the set of sales that each user has interaction with
    user_sale_set = set(zip(user_events['user_id'], user_events['sale_id']))

    # 1:1 ratio of negative to positive samples
    num_negatives = 1

    for (u, i) in tqdm(user_sale_set):
        users.append(u)
        sales.append(i)
        labels.append(1) # items that the user has interacted with are positive
        for _ in range(num_negatives):
            # randomly select an item
            negative_item = np.random.choice(all_saleIds) 
            # check that the user has not interacted with this item
            while (u, negative_item) in user_sale_set:
                negative_item = np.random.choice(all_saleIds)
            users.append(u)
            sales.append(negative_item)
            labels.append(0)

    augmented_interactions = pd.DataFrame({'user_id':users, 'sale_id': sales, 'interaction': labels})

    return augmented_interactions

In [7]:
def add_features(interaction: pd.DataFrame, sale_info: pd.DataFrame = None, user_info: pd.DataFrame = None):
    """
    The aim of this function is to add the user and sale features to the dataset.
    """
    
    featured_data = interaction.merge(sale_info, on='sale_id', how ='left')
    if user_info!=None:
        featured_data = featured_data.merge(user_info, on='user_id', how = 'left')
    
    return featured_data

In [8]:
augmented_data = creating_negative_interactions(clic_sales)

100%|██████████| 7920106/7920106 [01:21<00:00, 96750.56it/s] 


In [9]:
scored_data = add_features(augmented_data,data_sales)

In [None]:
clic_sales

Unnamed: 0,user_id,event_date,server_time,event_name,sale_id,status,platform
0,TsX5qcCf3WO8qLGxhddfRqPFLks1,2023-02-08,2023-02-08 00:00:10.178052 UTC,ClickSale,a929100fdb9e45459a9782762,ongoing,ios
1,dEyVggd7NZR2W3qtrVKdT0ECL8E2,2023-02-08,2023-02-08 00:00:25.810047 UTC,ClickSale,db7518ee5c624fbeb8e5ddc9e,ongoing,ios
2,FVjBUoHogzg6fvaQYqgdC2pCAA83,2023-02-08,2023-02-08 00:00:36.469548 UTC,ClickSale,b71bbbee4a864b609656bba5d,ongoing,ios
3,dEyVggd7NZR2W3qtrVKdT0ECL8E2,2023-02-08,2023-02-08 00:00:37.663173 UTC,ClickSale,5e72119da26b4ae687627520d,ongoing,ios
4,s8FDEtzv2EfLNtHM2oGJGNJcJ5K2,2023-02-08,2023-02-08 00:00:39.916514 UTC,ClickSale,a929100fdb9e45459a9782762,ongoing,android
...,...,...,...,...,...,...,...
9082237,6L8mi9NSyrVevHjwlcgEwztYdK82,2023-01-22,2023-01-22 23:59:13.399079 UTC,ClickSale,70bdbc0938514dada1bb08a18,ongoing,ios
9082238,Z9LiLDV1NwNOCQjwBR5uqE5nIlq1,2023-01-22,2023-01-22 23:59:13.559368 UTC,ClickSale,15755123dc6d4f7284aa22275,ongoing,ios
9082239,WB7u08ayyQXTeolVYOIKiDxj0bc2,2023-01-22,2023-01-22 23:59:31.990139 UTC,ClickSale,10bd38dac8504b0e99c2a4d23,ongoing,ios
9082240,p3vtQMxxtcbvedhtZBPIjypINW62,2023-01-22,2023-01-22 23:59:43.490519 UTC,ClickSale,6e9cf2fe52e84e9e9159ca0e0,ongoing,ios


In [None]:
scored_data.to_csv('scored_data.csv')

In [50]:
all_saleIds_date = clic_sales[['sale_id','event_date']].drop_duplicates(subset = 'sale_id')
user_events = all_saleIds_date[all_saleIds_date['event_date']== '2023-02-08'].sample()[['sale_id','event_date']]
print(user_events['sale_id'].tolist()[0], user_events['event_date'].tolist()[0])

d8dd800fdf794235a8851647e 2023-02-08


In [35]:
all_saleIds = user_events['sale_id'].unique()
negative_item = np.random.choice(all_saleIds)
negative_item

'f189b226da384e008b5efbd5b'

In [52]:
def creating_negative_interactions(user_events: pd.DataFrame):
    """
    The aim of this function is to create negative samples from a dataset of interactions between users and sales.
    """

    # Get a list of all sales
    all_saleIds = clic_sales[['sale_id','start_date']].drop_duplicates(subset = 'sale_id')

    # Placeholders that will hold the training data
    users, sales, date, labels = [], [], [], []

    # This is the set of sales that each user has interaction with
    user_sale_set = set(zip(user_events['user_id'], user_events['sale_id'], user_events['start_date']))

    # 1:1 ratio of negative to positive samples
    num_negatives = 1

    for (u, i, d) in tqdm(user_sale_set):
        users.append(u)
        sales.append(i)
        date.append(d)
        labels.append(1) # items that the user has interacted with are positive
        for _ in range(num_negatives):
            # randomly select an item
            negative_item = all_saleIds[all_saleIds['start_date']== str(d)].sample()[['sale_id','start_date']]
            # check that the user has not interacted with this item
            while (u, negative_item['sale_id'].tolist()[0], negative_item['start_date'].tolist()[0]) in user_sale_set:
                negative_item = all_saleIds[all_saleIds['start_date']== str(d)].sample()[['sale_id','start_date']]
            users.append(u)
            sales.append(negative_item['sale_id'].tolist()[0])
            date.append(negative_item['start_date'].tolist()[0])
            labels.append(0)

    augmented_interactions = pd.DataFrame({'user_id':users, 'sale_id': sales, 'start_date': date, 'interaction': labels})

    return augmented_interactions