# Negative inetraction
The aim of this file is to create negative interactions to train the model. The way it is done in this file is by creating one negative interaction per positive one on a sale that came out on the same day as the one that the user ordered from. In this way we create a balanced dataset and create interactions on sale that the user has most likely seen and not bought from.

In [273]:
import pandas as pd
from tqdm import tqdm
import numpy as np

import unidecode

In [297]:
raw_data_sales = pd.read_json('sales_mai22_mai23.json')
raw_orders = pd.read_json('orders_juil22_mai23.json')
raw_users = pd.read_json('user_info.json')
first_orders = pd.read_json('first_order_date.json')

In [143]:
def event_data_formating(user_events: pd.DataFrame):
    user_events = user_events.rename(columns={'Sales__start_at': 'start_at'})
    user_events['order_date'] = pd.to_datetime(user_events['order_date'])
    user_events['start_at'] = pd.to_datetime(user_events['start_at'])
    return user_events

In [144]:
def sale_data_formating(sale_info: pd.DataFrame):
    
    sale_info['start_at'] = pd.to_datetime(sale_info['start_at'])

    return sale_info

In [145]:
USER_CATEG_COLUMNS = ['category_1', 'category_2', 'category_3']

# Liste de catégories
sale_categories = ['sans_categorie','Accessoires', 'Beauté', 'Bibliothèque',
                     'Bien-Être', 'Bijoux', 'Buanderie', 'Chambre',
                     'Chaussant', 'Chaussures', 'Cuisine', 'Cures',
                     'Expériences', 'Hygiène', 'Kids', 'Lingerie',
                     'Maroquinerie', 'Outdoor', 'Prêt-à-porter',
                     'Salon', 'Soins', 'Sportswear']

# Création du mapping
mapping_cat = {category: index + 1 for index, category in enumerate(sale_categories)}

def user_data_formating(user_info: pd.DataFrame):
    df = user_info.copy(deep = True)
    for category in USER_CATEG_COLUMNS:
        df[category] = [mapping_cat.get(cat, 0) for cat in df[category]]
    return df

In [147]:
order_info = event_data_formating(raw_orders)
sale_info = sale_data_formating(raw_data_sales)
user_info = user_data_formating(raw_users)

In [148]:
order_info

Unnamed: 0,user_key,order_date,sale_id,start_at
0,XeqPxEqfbVYqZMi7zeOhQjyr0KF2,2022-08-01,db29d78ce4e34294b2a34238d,2022-08-01
1,h14WH8h2zaQvOoL3JWv4FcBvMc53,2022-08-01,463980e72ed841eba8946a3ad,2022-07-22
2,V9E3NUviw4QMkUmrlFMn4CuRzVz1,2022-08-01,838df3e0de5c49d2a401a53c9,2022-07-28
3,kuY6xEOn31Rm1QrzI8rapx8v6mU2,2022-08-01,838df3e0de5c49d2a401a53c9,2022-07-28
4,AFoNziXyCKRqug6y0rKE8LNn1Fc2,2022-08-01,bb45725866464027a38deb531,2022-07-29
...,...,...,...,...
792395,rzxiFbQW9ZbwU8zrRJIh0HqpxAJ3,2023-05-31,6ea0b2ab975f4facabfb92b36,2023-05-30
792396,gQHHIJluiUdoMvFwFhu3IlOzRxs2,2023-05-31,b182a15acf03490a80a123fe8,2023-05-24
792397,6edr33XbPHcf2lUI0ULhOZ7DOo02,2023-05-31,c4f32b5d2b22412486f2cd146,2023-05-30
792398,9YmYEvmqqRMqKHgnZJ7L18WDOch1,2023-05-31,d8dd6efdf48a4abd81d223186,2023-05-20


In [149]:
def test_train_split(orders: pd.DataFrame, nb_month_of_test_set: int):
    """
    The aim of this function is to split the data into test and train on the order dates
    using a given number of months for the test set.
    """

    last_event_date = max(orders['order_date'])
    split_date = last_event_date - pd.DateOffset(months = nb_month_of_test_set)

    test_data = orders[orders['order_date']>=split_date]
    train_data = orders[orders['order_date']<split_date]

    return (test_data, train_data)

In [150]:
def create_delta(events: pd.DataFrame):
    """
    Adds a column to calculate delta in hours between the sale opening and 
    the order date.
    """
    df_events = events.copy(deep=True)
    df_events['delta_heures'] = (df_events['order_date'] - df_events['start_at']).dt.total_seconds() / 3600
    df_events = df_events.drop(columns=['order_date'])

    return df_events


In [151]:
test_orders, train_orders = test_train_split(order_info, 1)
test_orders = create_delta(test_orders)
train_orders = create_delta(train_orders)

In [152]:
def generate_random_sale_with_same_date(all_saleIds, d):
    if len(all_saleIds[all_saleIds['start_at']== d])>=1:
        negative_item = all_saleIds[all_saleIds['start_at']== d].sample(n=1)
    else:
        negative_item = all_saleIds.sample(n=1)

    return (negative_item['sale_id'].tolist()[0])

In [153]:
def creating_negative_interactions(user_events: pd.DataFrame):
    """
    The aim of this function is to create negative samples from a dataset of interactions between users and sales.
    """
    # Get a list of all sales
    all_saleIds = user_events[['sale_id','start_at']].drop_duplicates(subset = 'sale_id')
    all_saleIds = all_saleIds.merge(sale_info['sale_id'], how='inner')
    

    # Placeholders that will hold the training data
    users, sales, labels, delta = [], [], [], []

    # This is the set of sales that each user has interaction with
    user_sale_set = set(zip(user_events['user_key'], user_events['sale_id'], user_events['start_at'], user_events['delta_heures']))
    # 1:1 ratio of negative to positive samples
    num_negatives = 1

    for (u, i, s, d) in tqdm(user_sale_set):
        users.append(u)
        sales.append(i)
        delta.append(d)
        labels.append(1) # items that the user has interacted with are positive
        for _ in range(num_negatives):
            # randomly select an item
            negative_item = generate_random_sale_with_same_date(all_saleIds, s)
            # check that the user has not interacted with this item
            while (u, negative_item) in user_sale_set:
                negative_item = generate_random_sale_with_same_date(all_saleIds, s)
            users.append(u)
            sales.append(negative_item)
            labels.append(0)
            delta.append(d)

    augmented_interactions = pd.DataFrame({'user_key': users, 'sale_id': sales, 'delta': delta, 'interaction': labels})

    return augmented_interactions

In [154]:
augmented_test_data = creating_negative_interactions(test_orders)
augmented_train_data = creating_negative_interactions(train_orders)

100%|██████████| 96843/96843 [01:12<00:00, 1334.97it/s]
100%|██████████| 681795/681795 [07:20<00:00, 1546.99it/s]


In [155]:
augmented_test_data

Unnamed: 0,user_key,sale_id,delta,interaction
0,Y21HGXBdBCYkVG0QI6u1tNLnSov2,95d7d2425ee94f47bac4ba617,96.0,1
1,Y21HGXBdBCYkVG0QI6u1tNLnSov2,b60d7bec80fe4e079145f8418,96.0,0
2,8fZsHUydVTQpOhZWqnZGyJ88kvn2,5a3a73c848114f5da57953022,312.0,1
3,8fZsHUydVTQpOhZWqnZGyJ88kvn2,80849457b6db4afdada8c2a17,312.0,0
4,UbEd1lIoqQaroAOWEr9ny8St5Uh1,4e83dc6ad06e49fc81012c678,72.0,1
...,...,...,...,...
193681,362O8DF8eTRRdM5C2sMVxJQwK1F3,a1bf3ce880d34fe683b7989ae,144.0,0
193682,IeTDE5Bb0wewMStQHCCt5MjGjZK2,9e258faf88094c30ad36eb305,216.0,1
193683,IeTDE5Bb0wewMStQHCCt5MjGjZK2,6bf90390e09c4955b8c7278e0,216.0,0
193684,dIWTR6GlnPUsxCPAltV4oY8Xa8M2,06394f3ae48b4a7ab5d410ee2,96.0,1


In [156]:
print(len(augmented_test_data[augmented_train_data['interaction']==1]), len(augmented_test_data[augmented_train_data['interaction']==0]))


96843 96843


  print(len(augmented_test_data[augmented_train_data['interaction']==1]), len(augmented_test_data[augmented_train_data['interaction']==0]))


In [157]:
def add_features(interaction: pd.DataFrame, sale_info: pd.DataFrame = None, user_info: pd.DataFrame = None):
    """
    The aim of this function is to add the user and sale features to the dataset.
    """
    print(len(interaction))
    featured_data = interaction.merge(sale_info, on='sale_id', how ='left')
    print(len(featured_data))
    featured_data = featured_data.merge(user_info, on='user_key', how='left')
    print(len(featured_data))
    
    return  featured_data.dropna()

In [302]:
def include_cold_start(data: pd.DataFrame, first_order_date: pd.DataFrame):
    df = data.copy(deep=True)

    df['start_at'] = pd.to_datetime(df['start_at'])
    first_order_date['min_start_date'] = pd.to_datetime(first_order_date['min_start_date'])
    # Create a boolean mask for the rows that satisfy the condition
    merged_df = df.merge(first_order_date, left_on=['user_key', 'start_at'], right_on=['user_key', 'min_start_date'], how='left')

    # Update the values for the matching rows
    merged_df.loc[merged_df['min_start_date'].notnull(), ['monetary', 'recency', 'frequency']] = 0
    merged_df.loc[merged_df['min_start_date'].notnull(), ['category_1', 'category_2', 'category_3']] = 0

    # Drop the redundant columns
    merged_df.drop(['min_start_date'], axis=1, inplace=True)

    return merged_df
    

In [303]:
scored_test_data = add_features(augmented_test_data, sale_info, user_info)
scored_train_data = add_features(augmented_train_data, sale_info, user_info)

193686
193686
193686
1363590
1363590
1363590


In [308]:
scored_train_data = include_cold_start(scored_train_data, first_orders)
scored_test_data = include_cold_start(scored_test_data, first_orders)


In [309]:
first_orders

Unnamed: 0,user_key,min_start_date
0,tUlux1BiU8TN0CAuzQfxK0AKHwm1,2022-10-29
1,CHaunPhl6uV3BHKZzm9h2RP8vhr2,2023-05-03
2,9irjr3RHbpY3SbqUPsKz8m4fTSN2,2023-03-03
3,zT7DW2ZtlEer7lfHUZ4CRibsPNy1,2022-12-09
4,OE6JC16kmBhU5iyLiW0LX7F2M0k2,2022-06-11
...,...,...
261601,RG4UfpC3g0P1B4MmIO4PnMjrFSI2,2023-03-13
261602,WOlHspx4E1frqMxTSR4HlPdPJbx2,2022-06-11
261603,4wVxfJsY6bdHOqmXBuW9uMU7SwX2,2022-09-05
261604,WeSZbH7q46aWgl1pujUs4K0Lu5d2,2023-01-19


In [316]:
scored_train_data.dropna()

Unnamed: 0,user_key,sale_id,delta,interaction,followers,conversion,revenue,brand_appearance,badges,avg_price,category,start_at,frequency,monetary,recency,category_1,category_2,category_3
0,iOa1WrdzrtPTdxQQjB2GjlFZobK2,3314b19d19ca4af0bea397639,24.0,1,16186.0,0.011353,34885.83600,1.0,{Iconique},5101.596286,Prêt-à-porter,2023-02-22,47.571429,35.757143,88,14,4,10
1,iOa1WrdzrtPTdxQQjB2GjlFZobK2,defa20e8502b417f92d31b40a,24.0,0,14028.0,0.007243,13544.43600,3.0,"{""Made in Europe"",Oeko-Tex}",3832.487724,Kids,2023-02-22,47.571429,35.757143,88,14,4,10
2,67guXdQfXgg4mqrsPUA5zivCT003,8c97f2f8041544a7b1e9c372b,0.0,1,66.0,0.006358,14306.78400,1.0,"{""Made in France"",Savoir-faire}",4347.339286,Buanderie,2023-04-19,15.722222,49.783333,11,19,1,8
3,67guXdQfXgg4mqrsPUA5zivCT003,6ef6ac42e9994f1a970412112,0.0,0,32397.0,0.032416,27403.36800,6.0,"{""Made in France"",Vegan,Naturel}",2215.492917,Beauté,2023-04-19,15.722222,49.783333,11,19,1,8
4,GTthlIB44FdprQrV5y3xBs0O4vE2,7c71abb8621c4199b3b7c50f2,192.0,1,878.0,0.005566,4745.41200,1.0,"{""Fait main"",""Socialement engagée"",Éco-friendl...",3682.465882,Maroquinerie,2023-04-20,94.000000,43.500000,39,3,17,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1322746,OTz9T6LMeVSTjIHYA3PkNE3Qwjm2,a4404f137624429388e072b7d,0.0,0,174281.0,0.009444,37041.06000,3.0,{Tendance},2728.066883,Lingerie,2023-03-28,8.875000,62.325000,9,20,11,12
1322747,RMC0oLOzOXUv1NkhIxWcjIpcII43,e24b934b618247bf8b1ef73c3,168.0,1,20655.0,0.013361,10154.05200,1.0,"{""Made in Europe"",Vegan,Naturel}",3675.055833,Soins,2023-01-18,0.000000,0.000000,0,0,0,0
1322748,RMC0oLOzOXUv1NkhIxWcjIpcII43,1351da9a83c54e20a6f651117,168.0,0,20717.0,0.002107,2443.24800,2.0,"{""Made in France"",""Made in Europe"",Durable,Nat...",4870.554973,Hygiène,2023-01-18,0.000000,0.000000,0,0,0,0
1322749,bH5qBPUK7fWlcaOm4PcouXKbt1g1,d993d446dc9142feb318265ad,48.0,1,13734.0,0.010496,10677.28800,8.0,"{""Made in France"",""Cadeau idéal""}",2564.287371,Bibliothèque,2022-12-17,58.666667,51.566667,168,4,13,17


In [317]:
scored_test_data.to_csv('scored_test_data.csv')
scored_train_data.to_csv('scored_train_data.csv')