In [282]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import ast

In [283]:
users = pd.read_csv('https://raw.githubusercontent.com/ardahk/amex/refs/heads/main/data/users_final.csv')
product_embeddings = pd.read_csv('https://github.com/ardahk/amex/raw/refs/heads/main/data/name_embeddings.csv')
products = pd.read_csv('https://github.com/ardahk/amex/raw/refs/heads/main/data/products.csv')
order_items = pd.read_csv('https://github.com/ardahk/amex/raw/refs/heads/main/data/order_items.csv')

In [284]:
product_embeddings.head()

Unnamed: 0,product_id,name_embedding
0,9588,"[-0.4065183401107788, 0.31074419617652893, -0...."
1,1238,"[-0.41240498423576355, 0.5829450488090515, -0...."
2,5485,"[-0.23880276083946228, 0.3456690013408661, -0...."
3,5485,"[-0.23880276083946228, 0.3456690013408661, -0...."
4,5485,"[-0.23880276083946228, 0.3456690013408661, -0...."


In [285]:
products.head()

Unnamed: 0,id,cost,category,name,brand,retail_price,department,sku,distribution_center_id
0,13842,2.51875,Accessories,Low Profile Dyed Cotton Twill Cap - Navy W39S55D,MG,6.25,Women,EBD58B8A3F1D72F4206201DA62FB1204,1
1,13928,2.33835,Accessories,Low Profile Dyed Cotton Twill Cap - Putty W39S55D,MG,5.95,Women,2EAC42424D12436BDD6A5B8A88480CC3,1
2,14115,4.87956,Accessories,Enzyme Regular Solid Army Caps-Black W35S45D,MG,10.99,Women,EE364229B2791D1EF9355708EFF0BA34,1
3,14157,4.64877,Accessories,Enzyme Regular Solid Army Caps-Olive W35S45D (...,MG,10.99,Women,00BD13095D06C20B11A2993CA419D16B,1
4,14273,6.50793,Accessories,Washed Canvas Ivy Cap - Black W11S64C,MG,15.99,Women,F531DC20FDE20B7ADF3A73F52B71D0AF,1


### Merge name embeddings and products dataframe

In [286]:
products_final = pd.merge(left = products, right = product_embeddings, left_on='id', right_on='product_id')

In [287]:
products_final.head()

Unnamed: 0,id,cost,category,name,brand,retail_price,department,sku,distribution_center_id,product_id,name_embedding
0,13928,2.33835,Accessories,Low Profile Dyed Cotton Twill Cap - Putty W39S55D,MG,5.95,Women,2EAC42424D12436BDD6A5B8A88480CC3,1,13928,"[-0.5451024770736694, -0.34113600850105286, -0..."
1,13928,2.33835,Accessories,Low Profile Dyed Cotton Twill Cap - Putty W39S55D,MG,5.95,Women,2EAC42424D12436BDD6A5B8A88480CC3,1,13928,"[-0.5451024770736694, -0.34113600850105286, -0..."
2,14115,4.87956,Accessories,Enzyme Regular Solid Army Caps-Black W35S45D,MG,10.99,Women,EE364229B2791D1EF9355708EFF0BA34,1,14115,"[0.1337980031967163, -0.20477981865406036, 0.0..."
3,14273,6.50793,Accessories,Washed Canvas Ivy Cap - Black W11S64C,MG,15.99,Women,F531DC20FDE20B7ADF3A73F52B71D0AF,1,14273,"[-0.40627333521842957, 0.14057165384292603, -0..."
4,14273,6.50793,Accessories,Washed Canvas Ivy Cap - Black W11S64C,MG,15.99,Women,F531DC20FDE20B7ADF3A73F52B71D0AF,1,14273,"[-0.40627333521842957, 0.14057165384292603, -0..."


In [288]:
products_final.drop(columns = ['id', 'name', 'distribution_center_id', 'sku'], inplace = True)

In [289]:
products['brand'].nunique()

2756

In [290]:
products['category'].nunique()

26

### There are too many unique brands to one-hot encode, so label encoding will be used with a reference table

In [291]:
label_encoder = LabelEncoder()

products_final['brand_encoded'] = label_encoder.fit_transform(products_final['brand'])

In [292]:
brands = pd.DataFrame({
    'brand': label_encoder.classes_,
    'brand_encoded': range(len(label_encoder.classes_))
})
print(brands.head())


           brand  brand_encoded
0      !it Jeans              0
1      '47 Brand              1
2    007Lingerie              2
3      106Shades              3
4  12XLShop Inc.              4


In [293]:
products_final.drop(columns = ['brand'], inplace = True)

In [294]:
products_final.head()

Unnamed: 0,cost,category,retail_price,department,product_id,name_embedding,brand_encoded
0,2.33835,Accessories,5.95,Women,13928,"[-0.5451024770736694, -0.34113600850105286, -0...",1158
1,2.33835,Accessories,5.95,Women,13928,"[-0.5451024770736694, -0.34113600850105286, -0...",1158
2,4.87956,Accessories,10.99,Women,14115,"[0.1337980031967163, -0.20477981865406036, 0.0...",1158
3,6.50793,Accessories,15.99,Women,14273,"[-0.40627333521842957, 0.14057165384292603, -0...",1158
4,6.50793,Accessories,15.99,Women,14273,"[-0.40627333521842957, 0.14057165384292603, -0...",1158


In [295]:
brands.to_csv('brand_reference.csv', index=False)

### Convert department and category to dummy variables

In [296]:
products_final = pd.get_dummies(products_final, columns = ['department', 'category'], dtype=int)

In [297]:
products_final.head()

Unnamed: 0,cost,retail_price,product_id,name_embedding,brand_encoded,department_Men,department_Women,category_Accessories,category_Active,category_Blazers & Jackets,...,category_Skirts,category_Sleep & Lounge,category_Socks,category_Socks & Hosiery,category_Suits,category_Suits & Sport Coats,category_Sweaters,category_Swim,category_Tops & Tees,category_Underwear
0,2.33835,5.95,13928,"[-0.5451024770736694, -0.34113600850105286, -0...",1158,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.33835,5.95,13928,"[-0.5451024770736694, -0.34113600850105286, -0...",1158,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4.87956,10.99,14115,"[0.1337980031967163, -0.20477981865406036, 0.0...",1158,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6.50793,15.99,14273,"[-0.40627333521842957, 0.14057165384292603, -0...",1158,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6.50793,15.99,14273,"[-0.40627333521842957, 0.14057165384292603, -0...",1158,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [298]:
products_final.columns

Index(['cost', 'retail_price', 'product_id', 'name_embedding', 'brand_encoded',
       'department_Men', 'department_Women', 'category_Accessories',
       'category_Active', 'category_Blazers & Jackets',
       'category_Clothing Sets', 'category_Dresses',
       'category_Fashion Hoodies & Sweatshirts', 'category_Intimates',
       'category_Jeans', 'category_Jumpsuits & Rompers', 'category_Leggings',
       'category_Maternity', 'category_Outerwear & Coats', 'category_Pants',
       'category_Pants & Capris', 'category_Plus', 'category_Shorts',
       'category_Skirts', 'category_Sleep & Lounge', 'category_Socks',
       'category_Socks & Hosiery', 'category_Suits',
       'category_Suits & Sport Coats', 'category_Sweaters', 'category_Swim',
       'category_Tops & Tees', 'category_Underwear'],
      dtype='object')

In [299]:
products_final.describe(include='all')

Unnamed: 0,cost,retail_price,product_id,name_embedding,brand_encoded,department_Men,department_Women,category_Accessories,category_Active,category_Blazers & Jackets,...,category_Skirts,category_Sleep & Lounge,category_Socks,category_Socks & Hosiery,category_Suits,category_Suits & Sport Coats,category_Sweaters,category_Swim,category_Tops & Tees,category_Underwear
count,19696.0,19696.0,19696.0,19696,19696.0,19696.0,19696.0,19696.0,19696.0,19696.0,...,19696.0,19696.0,19696.0,19696.0,19696.0,19696.0,19696.0,19696.0,19696.0,19696.0
unique,,,,13347,,,,,,,...,,,,,,,,,,
top,,,,"[-0.4557799696922302, 0.4693703055381775, -0.1...",,,,,,,...,,,,,,,,,,
freq,,,,8,,,,,,,...,,,,,,,,,,
mean,23.259658,47.800429,15314.426533,,986.47893,0.499391,0.500609,0.057575,0.047929,0.014115,...,0.011373,0.063617,0.035337,0.021781,0.004316,0.02437,0.061586,0.065089,0.069303,0.044628
std,16.444463,34.161563,8459.229347,,608.436405,0.500012,0.500012,0.232944,0.21362,0.117966,...,0.106038,0.244076,0.184635,0.145972,0.065553,0.1542,0.240408,0.24669,0.253976,0.206492
min,0.0083,0.02,4.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,11.0,23.950001,8113.0,,460.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,18.788,38.514999,15961.5,,951.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,31.345561,63.75,22680.5,,1483.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [300]:
products_final.dtypes

Unnamed: 0,0
cost,float64
retail_price,float64
product_id,int64
name_embedding,object
brand_encoded,int64
department_Men,int64
department_Women,int64
category_Accessories,int64
category_Active,int64
category_Blazers & Jackets,int64


### Convert name_enbeddings to a list of floats

In [301]:
products_final['name_embedding'] = products_final['name_embedding'].apply(ast.literal_eval)
products_final['name_embedding'] = products_final['name_embedding'].apply(lambda x: np.array(x, dtype=np.float32))

In [302]:
products_final.dtypes

Unnamed: 0,0
cost,float64
retail_price,float64
product_id,int64
name_embedding,object
brand_encoded,int64
department_Men,int64
department_Women,int64
category_Accessories,int64
category_Active,int64
category_Blazers & Jackets,int64


In [303]:
products_final.to_csv('products_final_numeric.csv', index=False)

### Convert all users data to numeric

In [304]:
users.head()

Unnamed: 0,id,age,gender,postal_code,traffic_source,avg_sequence_number,order_id,status,num_of_item,month_day
0,5399,44,M,36303,Organic,5.0,6771,Complete,1,09-05
1,42412,55,F,36303,Facebook,13.0,52934,Processing,4,05-11
2,72320,18,F,36303,Search,6.333333,90679,Complete,2,05-20
3,72320,18,F,36303,Search,6.333333,90680,Shipped,1,09-14
4,88924,38,M,36303,Search,5.0,111336,Cancelled,1,12-13


In [305]:
users_final = pd.get_dummies(users, columns = ['gender', 'status', 'traffic_source'], dtype=int)

### We need to use 'order_items' in order to extract product id. This is used to make the label for the user and items dataset (engineer a feature that indicates whether the user has interacted with the item or not)

In [306]:
order_items.head()

Unnamed: 0,id,order_id,user_id,product_id,inventory_item_id,status,created_at,shipped_at,delivered_at,returned_at,sale_price
0,152013,104663,83582,14235,410368,Cancelled,2023-05-07 06:08:40+00:00,,,,0.02
1,40993,28204,22551,14235,110590,Complete,2023-03-14 03:47:21+00:00,2023-03-15 22:57:00+00:00,2023-03-18 01:08:00+00:00,,0.02
2,51224,35223,28215,14235,138236,Complete,2023-12-05 13:25:30+00:00,2023-12-06 01:20:00+00:00,2023-12-10 10:04:00+00:00,,0.02
3,36717,25278,20165,14235,99072,Shipped,2023-12-22 20:48:19+00:00,2023-12-24 16:44:00+00:00,,,0.02
4,131061,90241,71954,14235,353798,Shipped,2022-06-19 16:57:59+00:00,2022-06-19 19:29:00+00:00,,,0.02


In [307]:
users_final = pd.merge(left = users_final, right = order_items, left_on='id', right_on='user_id')

In [308]:
users_final.columns

Index(['id_x', 'age', 'postal_code', 'avg_sequence_number', 'order_id_x',
       'num_of_item', 'month_day', 'gender_F', 'gender_M', 'status_Cancelled',
       'status_Complete', 'status_Processing', 'status_Returned',
       'status_Shipped', 'traffic_source_Display', 'traffic_source_Email',
       'traffic_source_Facebook', 'traffic_source_Organic',
       'traffic_source_Search', 'id_y', 'order_id_y', 'user_id', 'product_id',
       'inventory_item_id', 'status', 'created_at', 'shipped_at',
       'delivered_at', 'returned_at', 'sale_price'],
      dtype='object')

In [309]:
users_final.drop(columns = ['id_x', 'order_id_x', 'id_y', 'order_id_y', 'user_id', 'inventory_item_id','status', 'created_at', 'shipped_at',
       'delivered_at', 'returned_at', 'sale_price'], inplace = True)

dropping 'month-day' now for formatting purposes, however we can add it back in later if we think it has value

In [313]:
users_final.drop(columns = ['month_day'], inplace = True)

In [314]:
users_final.columns

Index(['age', 'postal_code', 'avg_sequence_number', 'num_of_item', 'gender_F',
       'gender_M', 'status_Cancelled', 'status_Complete', 'status_Processing',
       'status_Returned', 'status_Shipped', 'traffic_source_Display',
       'traffic_source_Email', 'traffic_source_Facebook',
       'traffic_source_Organic', 'traffic_source_Search', 'product_id'],
      dtype='object')

In [315]:
users_final.dtypes

Unnamed: 0,0
age,int64
postal_code,int64
avg_sequence_number,float64
num_of_item,int64
gender_F,int64
gender_M,int64
status_Cancelled,int64
status_Complete,int64
status_Processing,int64
status_Returned,int64


In [316]:
users_final.to_csv('users_final_numeric.csv', index=False)

## Building baseline 2 tower model

In [317]:
users_final.head()

Unnamed: 0,age,postal_code,avg_sequence_number,num_of_item,gender_F,gender_M,status_Cancelled,status_Complete,status_Processing,status_Returned,status_Shipped,traffic_source_Display,traffic_source_Email,traffic_source_Facebook,traffic_source_Organic,traffic_source_Search,product_id
0,44,36303,5.0,1,0,1,0,1,0,0,0,0,0,0,1,0,27958
1,55,36303,13.0,4,1,0,0,0,1,0,0,0,0,1,0,0,7012
2,55,36303,13.0,4,1,0,0,0,1,0,0,0,0,1,0,0,9230
3,55,36303,13.0,4,1,0,0,0,1,0,0,0,0,1,0,0,4653
4,55,36303,13.0,4,1,0,0,0,1,0,0,0,0,1,0,0,3855


In [318]:
users_final.shape

(80000, 17)

In [319]:
users_final.isna().sum()

Unnamed: 0,0
age,0
postal_code,0
avg_sequence_number,0
num_of_item,0
gender_F,0
gender_M,0
status_Cancelled,0
status_Complete,0
status_Processing,0
status_Returned,0


In [320]:
users_final.columns

Index(['age', 'postal_code', 'avg_sequence_number', 'num_of_item', 'gender_F',
       'gender_M', 'status_Cancelled', 'status_Complete', 'status_Processing',
       'status_Returned', 'status_Shipped', 'traffic_source_Display',
       'traffic_source_Email', 'traffic_source_Facebook',
       'traffic_source_Organic', 'traffic_source_Search', 'product_id'],
      dtype='object')

In [321]:
products_final.head()

Unnamed: 0,cost,retail_price,product_id,name_embedding,brand_encoded,department_Men,department_Women,category_Accessories,category_Active,category_Blazers & Jackets,...,category_Skirts,category_Sleep & Lounge,category_Socks,category_Socks & Hosiery,category_Suits,category_Suits & Sport Coats,category_Sweaters,category_Swim,category_Tops & Tees,category_Underwear
0,2.33835,5.95,13928,"[-0.5451025, -0.341136, -0.37329835, -0.315931...",1158,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.33835,5.95,13928,"[-0.5451025, -0.341136, -0.37329835, -0.315931...",1158,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4.87956,10.99,14115,"[0.133798, -0.20477982, 0.049048603, -0.086982...",1158,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6.50793,15.99,14273,"[-0.40627334, 0.14057165, -0.2866125, -0.27437...",1158,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6.50793,15.99,14273,"[-0.40627334, 0.14057165, -0.2866125, -0.27437...",1158,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [322]:
products_final.shape

(19696, 33)

In [323]:
products_final.isna().sum()

Unnamed: 0,0
cost,0
retail_price,0
product_id,0
name_embedding,0
brand_encoded,0
department_Men,0
department_Women,0
category_Accessories,0
category_Active,0
category_Blazers & Jackets,0


### The first issue is that for each training batch, we need to have the same amount of user-item pairs as input. This means we need to use some sort of sampling for each batch in order to make sure they're both the same size.

In [346]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate, Dot
from tensorflow.keras.models import Model

In [347]:
user_input = Input(shape=(16,), name='user_input')
item_input = Input(shape=(31,), name='item_input')

In [348]:
user_tower = Dense(64, activation='relu')(user_input)
user_tower = Dense(64, activation='relu')(user_tower)

In [349]:
item_tower = Dense(64, activation='relu')(item_input)
item_tower = Dense(64, activation='relu')(item_tower)

In [350]:
dot_product = Dot(axes=1)([user_tower, item_tower])

In [351]:
model = Model(inputs=[user_input, item_input], outputs=dot_product)

In [352]:
model.compile(optimizer='adam', loss='mse')

In [353]:
model.summary()

### Formatting inputs

In [356]:
def create_labels_and_train(users_df, products_df, model, batch_size, num_epochs):
    for epoch in range(num_epochs):
        # generate random user-item pairs through random indices for each batch
        user_indices = np.random.randint(0, len(users_df), size=batch_size)
        product_indices = np.random.randint(0, len(products_df), size=batch_size)

        # extract the data
        user_data = users_df.iloc[user_indices]#.copy()  # copy to avoid SettingWithCopyWarning
        product_data = products_df.iloc[product_indices]#.copy()

        # we will be creating target similarity labels
        target_similarity = []

        # loop through user and product indices to create labels
        for user_idx, product_idx in zip(user_indices, product_indices):
            user_product_id = users_df.iloc[user_idx]['product_id']
            item_product_id = products_df.iloc[product_idx]['product_id']

            # if the user and item product id match, it means the user purchased the product
            # otherwise, there is no interaction and the target similarity would be 0
            target_similarity.append(1 if user_product_id == item_product_id else 0)

        # convert to a numpy array
        target_similarity = np.array(target_similarity)

        # drop 'product_id' from both dataframes
        user_data = user_data.drop(columns=['product_id'])
        product_data = product_data.drop(columns=['product_id', 'name_embedding'])


        # train the model with the pairs
        model.fit([user_data.values, product_data.values], target_similarity, epochs=1, batch_size=batch_size)

# parameters
batch_size = 500
num_epochs = 25

create_labels_and_train(users_final, products_final, model, batch_size, num_epochs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 1048978063360.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - loss: 949033762816.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - loss: 560331096064.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - loss: 366262910976.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 319499960320.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - loss: 247981834240.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 190306992128.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - loss: 133274378240.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - loss: 92363513856.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - loss: 77075873792.0000
[1m1/1[0m [32m━━━━

so it works, but I needed to drop item embeddings (due to a formatting issue - converted it to a list of floats however it may need to be flattened?? unsure)
Also unsure of how to interpret the loss (seems realllyyyyy big)
Definitely need to do more research however it works!!!