In [441]:
import pandas as pd
import numpy as np
import ast
from sklearn.decomposition import PCA

In [442]:
product_embeddings = pd.read_csv("https://github.com/ardahk/amex/raw/refs/heads/main/data/name_embeddings.csv")
brand_embeddings = pd.read_csv("https://github.com/ardahk/amex/raw/refs/heads/main/data/brand_embeddings.csv")
products = pd.read_csv('https://github.com/ardahk/amex/raw/refs/heads/main/data/products.csv')
order_items = pd.read_csv('https://github.com/ardahk/amex/raw/refs/heads/main/data/order_items.csv')
users = pd.read_csv('https://raw.githubusercontent.com/ardahk/amex/refs/heads/main/data/users_final.csv')

# TO DO: FORMATTING DATA
## Products
- Merge product name & brand embeddings to products dataframe
- Convert 'department' and 'category' to dummy variables
- Flatten all embeddings so they're able to be used as input to the model
- Drop product_id and distribution_center_id

## Users
- Convert 'gender', 'status', and 'traffic source' to dummy variables
- Merge 'order_items' and 'users' to be able to extract 'product_id', in order to be able to create the label
- Drop user_id, order_id, and inventory_item_id


## PRODUCTS

In [443]:
brand_embeddings.head()

Unnamed: 0,brand,brand_embedding
0,MG,"[0.445360004901886, 0.5872200131416321, 1.2546..."
1,MG,"[0.445360004901886, 0.5872200131416321, 1.2546..."
2,MG,"[0.445360004901886, 0.5872200131416321, 1.2546..."
3,MG,"[0.445360004901886, 0.5872200131416321, 1.2546..."
4,MG,"[0.445360004901886, 0.5872200131416321, 1.2546..."


In [444]:
product_embeddings.head()

Unnamed: 0,product_id,name_embedding
0,9588,"[-0.4065183401107788, 0.31074419617652893, -0...."
1,1238,"[-0.41240498423576355, 0.5829450488090515, -0...."
2,5485,"[-0.23880276083946228, 0.3456690013408661, -0...."
3,5485,"[-0.23880276083946228, 0.3456690013408661, -0...."
4,5485,"[-0.23880276083946228, 0.3456690013408661, -0...."


issue is that the name embeddings are formatted as strings, so we need to convert them to lists, then a python numpy array

In [445]:
product_embeddings['name_embedding'] = product_embeddings['name_embedding'].apply(lambda x: np.array(ast.literal_eval(x)))
product_embeddings['flattened_name_embedding'] = product_embeddings['name_embedding'].apply(lambda x: np.array(x))
product_embeddings.head()

Unnamed: 0,product_id,name_embedding,flattened_name_embedding
0,9588,"[-0.4065183401107788, 0.31074419617652893, -0....","[-0.4065183401107788, 0.31074419617652893, -0...."
1,1238,"[-0.41240498423576355, 0.5829450488090515, -0....","[-0.41240498423576355, 0.5829450488090515, -0...."
2,5485,"[-0.23880276083946228, 0.3456690013408661, -0....","[-0.23880276083946228, 0.3456690013408661, -0...."
3,5485,"[-0.23880276083946228, 0.3456690013408661, -0....","[-0.23880276083946228, 0.3456690013408661, -0...."
4,5485,"[-0.23880276083946228, 0.3456690013408661, -0....","[-0.23880276083946228, 0.3456690013408661, -0...."


In [446]:
brand_embeddings['brand_embedding'] = brand_embeddings['brand_embedding'].apply(lambda x: np.array(ast.literal_eval(x)))
brand_embeddings['flattened_brand_embedding'] = brand_embeddings['brand_embedding'].apply(lambda x: np.array(x))
brand_embeddings.head()

Unnamed: 0,brand,brand_embedding,flattened_brand_embedding
0,MG,"[0.445360004901886, 0.5872200131416321, 1.2546...","[0.445360004901886, 0.5872200131416321, 1.2546..."
1,MG,"[0.445360004901886, 0.5872200131416321, 1.2546...","[0.445360004901886, 0.5872200131416321, 1.2546..."
2,MG,"[0.445360004901886, 0.5872200131416321, 1.2546...","[0.445360004901886, 0.5872200131416321, 1.2546..."
3,MG,"[0.445360004901886, 0.5872200131416321, 1.2546...","[0.445360004901886, 0.5872200131416321, 1.2546..."
4,MG,"[0.445360004901886, 0.5872200131416321, 1.2546...","[0.445360004901886, 0.5872200131416321, 1.2546..."


In [447]:
brand_embeddings.drop(columns = 'brand_embedding', inplace = True)

In [448]:
product_embeddings.drop(columns = 'name_embedding', inplace = True)

In [449]:
products.head()

Unnamed: 0,id,cost,category,name,brand,retail_price,department,sku,distribution_center_id
0,13842,2.51875,Accessories,Low Profile Dyed Cotton Twill Cap - Navy W39S55D,MG,6.25,Women,EBD58B8A3F1D72F4206201DA62FB1204,1
1,13928,2.33835,Accessories,Low Profile Dyed Cotton Twill Cap - Putty W39S55D,MG,5.95,Women,2EAC42424D12436BDD6A5B8A88480CC3,1
2,14115,4.87956,Accessories,Enzyme Regular Solid Army Caps-Black W35S45D,MG,10.99,Women,EE364229B2791D1EF9355708EFF0BA34,1
3,14157,4.64877,Accessories,Enzyme Regular Solid Army Caps-Olive W35S45D (...,MG,10.99,Women,00BD13095D06C20B11A2993CA419D16B,1
4,14273,6.50793,Accessories,Washed Canvas Ivy Cap - Black W11S64C,MG,15.99,Women,F531DC20FDE20B7ADF3A73F52B71D0AF,1


In [450]:
products.shape

(29120, 9)

- merge product name & brand embeddings to products dataframe

In [451]:
products_final = pd.merge(left = products, right = product_embeddings, left_on='id', right_on='product_id')

In [452]:
products_final = pd.merge(left = products_final, right = brand_embeddings, on='brand')

In [453]:
products_final = products_final.drop_duplicates(subset='product_id')

- convert 'department' and 'category' to dummy variables

In [454]:
products_final = pd.get_dummies(products_final, columns = ['department', 'category'], dtype=int)

- drop product_id, distribution_center_id, and any initial features which were converted

In [455]:
products_final.drop(columns = ['id', 'name', 'distribution_center_id', 'sku', 'brand'], inplace = True)

In [456]:
products_final.head()

Unnamed: 0,cost,retail_price,product_id,flattened_name_embedding,flattened_brand_embedding,department_Men,department_Women,category_Accessories,category_Active,category_Blazers & Jackets,...,category_Skirts,category_Sleep & Lounge,category_Socks,category_Socks & Hosiery,category_Suits,category_Suits & Sport Coats,category_Sweaters,category_Swim,category_Tops & Tees,category_Underwear
0,2.33835,5.95,13928,"[-0.5451024770736694, -0.34113600850105286, -0...","[0.445360004901886, 0.5872200131416321, 1.2546...",0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
26,4.87956,10.99,14115,"[0.1337980031967163, -0.20477981865406036, 0.0...","[0.445360004901886, 0.5872200131416321, 1.2546...",0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
39,6.50793,15.99,14273,"[-0.40627333521842957, 0.14057165384292603, -0...","[0.445360004901886, 0.5872200131416321, 1.2546...",0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
65,3.10625,6.25,15674,"[-0.515715479850769, -0.21141724288463593, -0....","[0.445360004901886, 0.5872200131416321, 1.2546...",0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
78,2.67594,6.18,28670,"[-0.6392737627029419, -0.10372474789619446, -0...","[0.445360004901886, 0.5872200131416321, 1.2546...",1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [457]:
products_final.shape

(13961, 33)

In [458]:
products_final.dtypes

Unnamed: 0,0
cost,float64
retail_price,float64
product_id,int64
flattened_name_embedding,object
flattened_brand_embedding,object
department_Men,int64
department_Women,int64
category_Accessories,int64
category_Active,int64
category_Blazers & Jackets,int64


In [459]:
products_final.columns

Index(['cost', 'retail_price', 'product_id', 'flattened_name_embedding',
       'flattened_brand_embedding', 'department_Men', 'department_Women',
       'category_Accessories', 'category_Active', 'category_Blazers & Jackets',
       'category_Clothing Sets', 'category_Dresses',
       'category_Fashion Hoodies & Sweatshirts', 'category_Intimates',
       'category_Jeans', 'category_Jumpsuits & Rompers', 'category_Leggings',
       'category_Maternity', 'category_Outerwear & Coats', 'category_Pants',
       'category_Pants & Capris', 'category_Plus', 'category_Shorts',
       'category_Skirts', 'category_Sleep & Lounge', 'category_Socks',
       'category_Socks & Hosiery', 'category_Suits',
       'category_Suits & Sport Coats', 'category_Sweaters', 'category_Swim',
       'category_Tops & Tees', 'category_Underwear'],
      dtype='object')

## USERS

- Convert 'gender', 'status', and 'traffic source' to dummy variables

In [460]:
users_final = pd.get_dummies(users, columns = ['gender', 'status', 'traffic_source'], dtype = int)

- Merge 'order_items' and 'users' to be able to extract 'product_id', in order to be able to create the label

In [461]:
users_final = pd.merge(left = users_final, right = order_items[['user_id', 'product_id']], left_on= 'id', right_on = 'user_id')

- Drop user_id, order_id, and inventory_item_id

In [462]:
users_final.drop(columns = ['id', 'order_id', 'month_day'], inplace = True)

In [463]:
users_final.head()

Unnamed: 0,age,postal_code,avg_sequence_number,num_of_item,gender_F,gender_M,status_Cancelled,status_Complete,status_Processing,status_Returned,status_Shipped,traffic_source_Display,traffic_source_Email,traffic_source_Facebook,traffic_source_Organic,traffic_source_Search,user_id,product_id
0,44,36303,5.0,1,0,1,0,1,0,0,0,0,0,0,1,0,5399,27958
1,55,36303,13.0,4,1,0,0,0,1,0,0,0,0,1,0,0,42412,7012
2,55,36303,13.0,4,1,0,0,0,1,0,0,0,0,1,0,0,42412,9230
3,55,36303,13.0,4,1,0,0,0,1,0,0,0,0,1,0,0,42412,4653
4,55,36303,13.0,4,1,0,0,0,1,0,0,0,0,1,0,0,42412,3855


In [464]:
users_final.columns

Index(['age', 'postal_code', 'avg_sequence_number', 'num_of_item', 'gender_F',
       'gender_M', 'status_Cancelled', 'status_Complete', 'status_Processing',
       'status_Returned', 'status_Shipped', 'traffic_source_Display',
       'traffic_source_Email', 'traffic_source_Facebook',
       'traffic_source_Organic', 'traffic_source_Search', 'user_id',
       'product_id'],
      dtype='object')

### Check and make sure all columns are integers

In [465]:
users_final.dtypes

Unnamed: 0,0
age,int64
postal_code,int64
avg_sequence_number,float64
num_of_item,int64
gender_F,int64
gender_M,int64
status_Cancelled,int64
status_Complete,int64
status_Processing,int64
status_Returned,int64


In [466]:
products_final.dtypes

Unnamed: 0,0
cost,float64
retail_price,float64
product_id,int64
flattened_name_embedding,object
flattened_brand_embedding,object
department_Men,int64
department_Women,int64
category_Accessories,int64
category_Active,int64
category_Blazers & Jackets,int64


Everything is an integer except for the embeddings, which are objects but we will check and make sure they can be used as input for the model since they were converted from strings to numpy arrays.

# PCA

for some reason, nothing could make these embeddings as input into the model. so therefore PCA.

# MODEL

I encountered an issue when trying to engineer the label, some product IDs exist only in the products or the users table.

In [467]:
unique_in_users = set(users_final['product_id']) - set(products_final['product_id'])
print("Number of ID's just in users: ", len(unique_in_users))

unique_in_products = set(products_final['product_id']) - set(users_final['product_id'])
print("Number of ID's just in products: ", len(unique_in_products))

Number of ID's just in users:  7638
Number of ID's just in products:  161


Therefore we will only keep instances where : Users have purchased a product that has existed within our database, and products have been purchased by a user that exists within our database

In [468]:
common_ids = set(users_final['product_id']).intersection(set(products_final['product_id']))
# filter dataframe to only include the intersection
users_final = users_final[users_final['product_id'].isin(common_ids)]
products_final = products_final[products_final['product_id'].isin(common_ids)]

In [469]:
users_final.shape

(57696, 18)

In [470]:
users_final.columns

Index(['age', 'postal_code', 'avg_sequence_number', 'num_of_item', 'gender_F',
       'gender_M', 'status_Cancelled', 'status_Complete', 'status_Processing',
       'status_Returned', 'status_Shipped', 'traffic_source_Display',
       'traffic_source_Email', 'traffic_source_Facebook',
       'traffic_source_Organic', 'traffic_source_Search', 'user_id',
       'product_id'],
      dtype='object')

In [471]:
products_final.head()

Unnamed: 0,cost,retail_price,product_id,flattened_name_embedding,flattened_brand_embedding,department_Men,department_Women,category_Accessories,category_Active,category_Blazers & Jackets,...,category_Skirts,category_Sleep & Lounge,category_Socks,category_Socks & Hosiery,category_Suits,category_Suits & Sport Coats,category_Sweaters,category_Swim,category_Tops & Tees,category_Underwear
0,2.33835,5.95,13928,"[-0.5451024770736694, -0.34113600850105286, -0...","[0.445360004901886, 0.5872200131416321, 1.2546...",0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
26,4.87956,10.99,14115,"[0.1337980031967163, -0.20477981865406036, 0.0...","[0.445360004901886, 0.5872200131416321, 1.2546...",0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
39,6.50793,15.99,14273,"[-0.40627333521842957, 0.14057165384292603, -0...","[0.445360004901886, 0.5872200131416321, 1.2546...",0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
65,3.10625,6.25,15674,"[-0.515715479850769, -0.21141724288463593, -0....","[0.445360004901886, 0.5872200131416321, 1.2546...",0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
78,2.67594,6.18,28670,"[-0.6392737627029419, -0.10372474789619446, -0...","[0.445360004901886, 0.5872200131416321, 1.2546...",1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [472]:
products_final.reset_index(drop=True, inplace=True)

In [473]:
products_final.head()

Unnamed: 0,cost,retail_price,product_id,flattened_name_embedding,flattened_brand_embedding,department_Men,department_Women,category_Accessories,category_Active,category_Blazers & Jackets,...,category_Skirts,category_Sleep & Lounge,category_Socks,category_Socks & Hosiery,category_Suits,category_Suits & Sport Coats,category_Sweaters,category_Swim,category_Tops & Tees,category_Underwear
0,2.33835,5.95,13928,"[-0.5451024770736694, -0.34113600850105286, -0...","[0.445360004901886, 0.5872200131416321, 1.2546...",0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4.87956,10.99,14115,"[0.1337980031967163, -0.20477981865406036, 0.0...","[0.445360004901886, 0.5872200131416321, 1.2546...",0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6.50793,15.99,14273,"[-0.40627333521842957, 0.14057165384292603, -0...","[0.445360004901886, 0.5872200131416321, 1.2546...",0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3.10625,6.25,15674,"[-0.515715479850769, -0.21141724288463593, -0....","[0.445360004901886, 0.5872200131416321, 1.2546...",0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2.67594,6.18,28670,"[-0.6392737627029419, -0.10372474789619446, -0...","[0.445360004901886, 0.5872200131416321, 1.2546...",1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [474]:
products_final.shape

(13800, 33)

In [475]:
products_final.columns

Index(['cost', 'retail_price', 'product_id', 'flattened_name_embedding',
       'flattened_brand_embedding', 'department_Men', 'department_Women',
       'category_Accessories', 'category_Active', 'category_Blazers & Jackets',
       'category_Clothing Sets', 'category_Dresses',
       'category_Fashion Hoodies & Sweatshirts', 'category_Intimates',
       'category_Jeans', 'category_Jumpsuits & Rompers', 'category_Leggings',
       'category_Maternity', 'category_Outerwear & Coats', 'category_Pants',
       'category_Pants & Capris', 'category_Plus', 'category_Shorts',
       'category_Skirts', 'category_Sleep & Lounge', 'category_Socks',
       'category_Socks & Hosiery', 'category_Suits',
       'category_Suits & Sport Coats', 'category_Sweaters', 'category_Swim',
       'category_Tops & Tees', 'category_Underwear'],
      dtype='object')

In [476]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate, Dot, BatchNormalization
from tensorflow.keras.models import Model

In [477]:
user_input = Input(shape=(16,), name='user_input')
item_input = Input(shape=(30,), name='item_input')

In [478]:
user_tower = Dense(128, activation='relu')(user_input)
user_tower = BatchNormalization()(user_tower)
item_tower = Dense(128, activation='relu')(item_input)
item_tower = BatchNormalization()(item_tower)

In [479]:
dot_product = Dot(axes=1, normalize=False, name="dot_product")([user_tower, item_tower])

In [480]:
output = Dense(1, activation='sigmoid', name="output")(dot_product)

In [481]:
model = Model(inputs=[user_input, item_input], outputs=output)

In [487]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['precision'])

In [488]:
model.summary()

In [493]:
def create_labels_and_train(users_df, products_df, model, batch_size, num_epochs):
    for epoch in range(num_epochs):
        # initilize the target similarity for the bath
        target_similarity = []

        # we're making the target similarity balanced, so there's an equal number of posivie and negetive indices in each batch
        num_indices = batch_size // 2

        # generating 1/2 batch size of random pairs, where there are posivive indices (user and product have the same ID)
        positive_user_indices = np.random.randint(0, len(users_df), size=num_indices)
        # initizie storage of positive indicies
        positive_product_indices = []
        # loop over every user
        for user_idx in positive_user_indices:
            # locating product IDs in the user dataframe for the user we sampled
            user_product_id = users_df.iloc[user_idx]['product_id']
            # finding matching products in the products dataframe
            matching_products = products_df[products_df['product_id'] == user_product_id]
            # append the matching product to the positive product indices
            positive_product_indices.append(matching_products.index[0])

        # Generate random negative pairs (user and product have different product_ids)
        negative_user_indices = np.random.randint(0, len(users_df), size=num_indices)
        #print("NEGATIVE USER INDICES: ", negative_user_indices)
        negative_product_indices = []
        for user_idx in negative_user_indices:
            user_product_id = users_df.iloc[user_idx]['product_id']
            # find a product that doesn't have a matching product id
            non_matching_products = products_df[products_df['product_id'] != user_product_id]
            # append that to the negetive indicies
            negative_product_indices.append(non_matching_products.sample(1).index[0])

        # combining both positive and negetive indicies
        user_indices = np.concatenate([positive_user_indices, negative_user_indices])
        product_indices = np.concatenate([positive_product_indices, negative_product_indices])

        # create target similarity labels for the positive and negetive pairs
        target_similarity.extend([1] * num_indices)  # Positive pairs
        target_similarity.extend([0] * num_indices)  # Negative pairs
        target_similarity = np.array(target_similarity)

        # get the positive & negetive user data
        user_data = users_df.iloc[user_indices]
        user_ids = user_data['user_id'].tolist()
        product_data = products_df.iloc[product_indices]
        item_ids = product_data['product_id'].tolist()

        user_data = user_data.drop(columns=['product_id', 'user_id'])
        product_data = product_data.drop(columns=['product_id', 'flattened_name_embedding', 'flattened_brand_embedding'])

        # Train the model with the pairs
        model.fit([user_data, product_data], target_similarity, epochs=1, batch_size=batch_size)
        predicted_probabilities = model.predict([user_data, product_data]).flatten()
        user_item_predictions = list(zip(user_ids, item_ids, predicted_probabilities, target_similarity))
        for user_id, item_id, predicted_prob, target_sim in user_item_predictions:
            print(f"User ID: {user_id}, Item ID: {item_id}, Predicted Probability: {predicted_prob:.4f}, Target Similarity: {target_sim}")

# Parameters
batch_size = 250
num_epochs = 15

create_labels_and_train(users_final, products_final, model, batch_size, num_epochs)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 2.5158 - precision: 0.4958
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Epoch 1, Precision at 5: 0.2000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - loss: 3.1535 - precision: 0.4720
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Epoch 2, Precision at 5: 0.2000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - loss: 2.5556 - precision: 0.4553
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Epoch 3, Precision at 5: 0.2000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - loss: 2.7431 - precision: 0.4516
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Epoch 4, Precision at 5: 0.2000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 2.2515 - precision: 0.5556
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

In [490]:
# Assuming you have the predictions from above
predicted_probabilities = predictions.flatten()  # Flatten to 1D array of probabilities

# Combine user, item, and predicted probability
user_item_predictions = list(zip(user_ids, item_ids, predicted_probabilities))

# Example of printing the first few predictions
for user, item, prob in user_item_predictions[:10]:
    print(f"User: {user}, Item: {item}, Predicted Probability: {prob}")


NameError: name 'predictions' is not defined